iommu/vt-d: Clean up size handling for intel_iommu_unmap()
drivers/iommu/intel-iommu.c
1 /*
2 * Copyright (c) 2006, Intel Corporation.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms and conditions of the GNU General Public License,
6 * version 2, as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope it will be useful, but WITHOUT
9 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
11 * more details.
12 *
13 * You should have received a copy of the GNU General Public License along with
14 * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
15 * Place - Suite 330, Boston, MA 02111-1307 USA.
16 *
17 * Copyright (C) 2006-2008 Intel Corporation
18 * Author: Ashok Raj <ashok.raj@intel.com>
19 * Author: Shaohua Li <shaohua.li@intel.com>
20 * Author: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
21 * Author: Fenghua Yu <fenghua.yu@intel.com>
22 */
23
24 #include <linux/init.h>
25 #include <linux/bitmap.h>
26 #include <linux/debugfs.h>
27 #include <linux/export.h>
28 #include <linux/slab.h>
29 #include <linux/irq.h>
30 #include <linux/interrupt.h>
31 #include <linux/spinlock.h>
32 #include <linux/pci.h>
33 #include <linux/dmar.h>
34 #include <linux/dma-mapping.h>
35 #include <linux/mempool.h>
36 #include <linux/memory.h>
37 #include <linux/timer.h>
38 #include <linux/iova.h>
39 #include <linux/iommu.h>
40 #include <linux/intel-iommu.h>
41 #include <linux/syscore_ops.h>
42 #include <linux/tboot.h>
43 #include <linux/dmi.h>
44 #include <linux/pci-ats.h>
45 #include <linux/memblock.h>
46 #include <asm/irq_remapping.h>
47 #include <asm/cacheflush.h>
48 #include <asm/iommu.h>
49
50 #include "irq_remapping.h"
51 #include "pci.h"
52
53 #define ROOT_SIZE VTD_PAGE_SIZE
54 #define CONTEXT_SIZE VTD_PAGE_SIZE
55
56 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
57 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
58 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
59
60 #define IOAPIC_RANGE_START (0xfee00000)
61 #define IOAPIC_RANGE_END (0xfeefffff)
62 #define IOVA_START_ADDR (0x1000)
63
64 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 48
65
66 #define MAX_AGAW_WIDTH 64
67 #define MAX_AGAW_PFN_WIDTH (MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
68
69 #define __DOMAIN_MAX_PFN(gaw) ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
70 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)
71
72 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
73 to match. That way, we can use 'unsigned long' for PFNs with impunity. */
74 #define DOMAIN_MAX_PFN(gaw) ((unsigned long) min_t(uint64_t, \
75 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
76 #define DOMAIN_MAX_ADDR(gaw) (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
77
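/*
 * Worked example (illustrative): with the default 48-bit guest address
 * width and VTD_PAGE_SHIFT == 12,
 *
 *	__DOMAIN_MAX_PFN(48)  = (1ULL << (48 - 12)) - 1 = 0xFFFFFFFFF
 *	__DOMAIN_MAX_ADDR(48) = (1ULL << 48) - 1        = 0xFFFFFFFFFFFF
 *
 * On a 64-bit kernel the min_t() clamp in DOMAIN_MAX_PFN() changes nothing;
 * on 32-bit it caps the PFN at ULONG_MAX so PFN arithmetic stays within
 * 'unsigned long', as the comment above explains.
 */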
78 #define IOVA_PFN(addr) ((addr) >> PAGE_SHIFT)
79 #define DMA_32BIT_PFN IOVA_PFN(DMA_BIT_MASK(32))
80 #define DMA_64BIT_PFN IOVA_PFN(DMA_BIT_MASK(64))
81
82 /* page table handling */
83 #define LEVEL_STRIDE (9)
84 #define LEVEL_MASK (((u64)1 << LEVEL_STRIDE) - 1)
85
86 /*
87 * This bitmap is used to advertise the page sizes our hardware supports
88 * to the IOMMU core, which will then use this information to split
89 * physically contiguous memory regions it is mapping into page sizes
90 * that we support.
91 *
92 * Traditionally the IOMMU core just handed us the mappings directly,
93 * after making sure the size is a power-of-two number of 4KiB pages
94 * and that the mapping has natural alignment.
95 *
96 * To retain this behavior, we currently advertise that we support
97 * all page sizes that are a power-of-two multiple of 4KiB.
98 *
99 * If at some point we'd like to utilize the IOMMU core's new behavior,
100 * we could change this to advertise the real page sizes we support.
101 */
102 #define INTEL_IOMMU_PGSIZES (~0xFFFUL)
103
104 static inline int agaw_to_level(int agaw)
105 {
106 return agaw + 2;
107 }
108
109 static inline int agaw_to_width(int agaw)
110 {
111 return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
112 }
113
114 static inline int width_to_agaw(int width)
115 {
116 return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
117 }
118
119 static inline unsigned int level_to_offset_bits(int level)
120 {
121 return (level - 1) * LEVEL_STRIDE;
122 }
123
124 static inline int pfn_level_offset(unsigned long pfn, int level)
125 {
126 return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
127 }
128
129 static inline unsigned long level_mask(int level)
130 {
131 return -1UL << level_to_offset_bits(level);
132 }
133
134 static inline unsigned long level_size(int level)
135 {
136 return 1UL << level_to_offset_bits(level);
137 }
138
139 static inline unsigned long align_to_level(unsigned long pfn, int level)
140 {
141 return (pfn + level_size(level) - 1) & level_mask(level);
142 }
143
144 static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
145 {
146 return 1 << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
147 }
148
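/*
 * Worked example (illustrative): with DEFAULT_DOMAIN_ADDRESS_WIDTH == 48,
 *
 *	width_to_agaw(48) = DIV_ROUND_UP(48 - 30, 9) = 2
 *	agaw_to_level(2)  = 4		(a four-level page table)
 *	agaw_to_width(2)  = 48
 *
 * and for that four-level table level_to_offset_bits(4) = 27, i.e. the
 * top-level index is taken from bits 27..35 of the VT-d PFN, one 9-bit
 * LEVEL_STRIDE per level below it.
 */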
149 /* VT-d pages must always be _smaller_ than MM pages. Otherwise things
150 are never going to work. */
151 static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
152 {
153 return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
154 }
155
156 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
157 {
158 return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
159 }
160 static inline unsigned long page_to_dma_pfn(struct page *pg)
161 {
162 return mm_to_dma_pfn(page_to_pfn(pg));
163 }
164 static inline unsigned long virt_to_dma_pfn(void *p)
165 {
166 return page_to_dma_pfn(virt_to_page(p));
167 }
168
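/*
 * Illustrative note: on x86, PAGE_SHIFT == VTD_PAGE_SHIFT == 12, so
 * dma_to_mm_pfn() and mm_to_dma_pfn() are identity conversions. On a
 * hypothetical architecture with 64KiB MM pages, each MM PFN would expand
 * to 16 consecutive VT-d PFNs:
 *
 *	mm_to_dma_pfn(pfn) == pfn << (16 - 12) == pfn * 16
 */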
169 /* global iommu list, set NULL for ignored DMAR units */
170 static struct intel_iommu **g_iommus;
171
172 static void __init check_tylersburg_isoch(void);
173 static int rwbf_quirk;
174
175 /*
176 * set to 1 to panic the kernel if VT-d can't be enabled successfully
177 * (used when the kernel is launched with TXT)
178 */
179 static int force_on = 0;
180
181 /*
182 * 0: Present
183 * 1-11: Reserved
184 * 12-63: Context Ptr (12 - (haw-1))
185 * 64-127: Reserved
186 */
187 struct root_entry {
188 u64 val;
189 u64 rsvd1;
190 };
191 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
192 static inline bool root_present(struct root_entry *root)
193 {
194 return (root->val & 1);
195 }
196 static inline void set_root_present(struct root_entry *root)
197 {
198 root->val |= 1;
199 }
200 static inline void set_root_value(struct root_entry *root, unsigned long value)
201 {
202 root->val |= value & VTD_PAGE_MASK;
203 }
204
205 static inline struct context_entry *
206 get_context_addr_from_root(struct root_entry *root)
207 {
208 return (struct context_entry *)
209 (root_present(root)?phys_to_virt(
210 root->val & VTD_PAGE_MASK) :
211 NULL);
212 }
213
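/*
 * Illustrative example (addresses assumed): the root table holds 256
 * root_entry slots, one per PCI bus. After device_to_context_entry()
 * below installs a context table at physical address 0x1000000 for bus
 * 0x3f,
 *
 *	iommu->root_entry[0x3f].val == 0x1000001	(address | present)
 *
 * and get_context_addr_from_root() maps that back to the virtual address
 * of the 256-entry context table indexed by devfn.
 */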
214 /*
215 * low 64 bits:
216 * 0: present
217 * 1: fault processing disable
218 * 2-3: translation type
219 * 12-63: address space root
220 * high 64 bits:
221 * 0-2: address width
222 * 3-6: avail
223 * 8-23: domain id
224 */
225 struct context_entry {
226 u64 lo;
227 u64 hi;
228 };
229
230 static inline bool context_present(struct context_entry *context)
231 {
232 return (context->lo & 1);
233 }
234 static inline void context_set_present(struct context_entry *context)
235 {
236 context->lo |= 1;
237 }
238
239 static inline void context_set_fault_enable(struct context_entry *context)
240 {
241 context->lo &= (((u64)-1) << 2) | 1;
242 }
243
244 static inline void context_set_translation_type(struct context_entry *context,
245 unsigned long value)
246 {
247 context->lo &= (((u64)-1) << 4) | 3;
248 context->lo |= (value & 3) << 2;
249 }
250
251 static inline void context_set_address_root(struct context_entry *context,
252 unsigned long value)
253 {
254 context->lo |= value & VTD_PAGE_MASK;
255 }
256
257 static inline void context_set_address_width(struct context_entry *context,
258 unsigned long value)
259 {
260 context->hi |= value & 7;
261 }
262
263 static inline void context_set_domain_id(struct context_entry *context,
264 unsigned long value)
265 {
266 context->hi |= (value & ((1 << 16) - 1)) << 8;
267 }
268
269 static inline void context_clear_entry(struct context_entry *context)
270 {
271 context->lo = 0;
272 context->hi = 0;
273 }
274
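/*
 * Minimal sketch (assumed values, mirroring domain_context_mapping_one()
 * later in this file): composing a multi-level context entry for domain
 * id 5 with a 4-level (agaw 2) page table rooted at 'pgd':
 *
 *	struct context_entry ce = { 0, 0 };
 *
 *	context_set_domain_id(&ce, 5);			   hi[8:23]  = 5
 *	context_set_address_root(&ce, virt_to_phys(pgd)); lo[12:63] = root
 *	context_set_address_width(&ce, 2);		   hi[0:2]   = 2
 *	context_set_translation_type(&ce, CONTEXT_TT_MULTI_LEVEL);
 *	context_set_fault_enable(&ce);			   clears lo[1]
 *	context_set_present(&ce);			   lo[0]     = 1
 */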
275 /*
276 * 0: readable
277 * 1: writable
278 * 2-6: reserved
279 * 7: super page
280 * 8-10: available
281 * 11: snoop behavior
282 * 12-63: Host physical address
283 */
284 struct dma_pte {
285 u64 val;
286 };
287
288 static inline void dma_clear_pte(struct dma_pte *pte)
289 {
290 pte->val = 0;
291 }
292
293 static inline u64 dma_pte_addr(struct dma_pte *pte)
294 {
295 #ifdef CONFIG_64BIT
296 return pte->val & VTD_PAGE_MASK;
297 #else
298 /* Must have a full atomic 64-bit read */
299 return __cmpxchg64(&pte->val, 0ULL, 0ULL) & VTD_PAGE_MASK;
300 #endif
301 }
302
303 static inline bool dma_pte_present(struct dma_pte *pte)
304 {
305 return (pte->val & 3) != 0;
306 }
307
308 static inline bool dma_pte_superpage(struct dma_pte *pte)
309 {
310 return (pte->val & (1 << 7));
311 }
312
313 static inline int first_pte_in_page(struct dma_pte *pte)
314 {
315 return !((unsigned long)pte & ~VTD_PAGE_MASK);
316 }
317
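/*
 * Illustrative example: a present, writable last-level PTE pointing at
 * host physical page 0x12345 holds
 *
 *	pte.val = (0x12345ULL << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE
 *		= 0x12345003
 *
 * dma_pte_addr() masks off the low control bits and returns 0x12345000,
 * while dma_pte_superpage() tests bit 7, which __domain_mapping() sets
 * via DMA_PTE_LARGE_PAGE for 2MiB/1GiB mappings.
 */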
318 /*
319 * This domain is a static identity mapping domain.
320 * 1. This domain creates a static 1:1 mapping to all usable memory.
321 * 2. It maps to each iommu if successful.
322 * 3. Each iommu maps to this domain if successful.
323 */
324 static struct dmar_domain *si_domain;
325 static int hw_pass_through = 1;
326
327 /* devices under the same p2p bridge are owned by one domain */
328 #define DOMAIN_FLAG_P2P_MULTIPLE_DEVICES (1 << 0)
329
330 /* domain represents a virtual machine; more than one device
331 * across iommus may be owned by one domain, e.g. a kvm guest.
332 */
333 #define DOMAIN_FLAG_VIRTUAL_MACHINE (1 << 1)
334
335 /* si_domain contains multiple devices */
336 #define DOMAIN_FLAG_STATIC_IDENTITY (1 << 2)
337
338 /* define the limit of IOMMUs supported in each domain */
339 #ifdef CONFIG_X86
340 # define IOMMU_UNITS_SUPPORTED MAX_IO_APICS
341 #else
342 # define IOMMU_UNITS_SUPPORTED 64
343 #endif
344
345 struct dmar_domain {
346 int id; /* domain id */
347 int nid; /* node id */
348 DECLARE_BITMAP(iommu_bmp, IOMMU_UNITS_SUPPORTED);
349 /* bitmap of iommus this domain uses*/
350
351 struct list_head devices; /* all devices' list */
352 struct iova_domain iovad; /* iova's that belong to this domain */
353
354 struct dma_pte *pgd; /* virtual address */
355 int gaw; /* max guest address width */
356
357 /* adjusted guest address width, 0 is level 2 30-bit */
358 int agaw;
359
360 int flags; /* flags to find out type of domain */
361
362 int iommu_coherency;/* indicate coherency of iommu access */
363 int iommu_snooping; /* indicate snooping control feature*/
364 int iommu_count; /* reference count of iommu */
365 int iommu_superpage;/* Level of superpages supported:
366 0 == 4KiB (no superpages), 1 == 2MiB,
367 2 == 1GiB, 3 == 512GiB, 4 == 1TiB */
368 spinlock_t iommu_lock; /* protect iommu set in domain */
369 u64 max_addr; /* maximum mapped address */
370 };
371
372 /* PCI domain-device relationship */
373 struct device_domain_info {
374 struct list_head link; /* link to domain siblings */
375 struct list_head global; /* link to global list */
376 int segment; /* PCI domain */
377 u8 bus; /* PCI bus number */
378 u8 devfn; /* PCI devfn number */
379 struct pci_dev *dev; /* it's NULL for PCIe-to-PCI bridge */
380 struct intel_iommu *iommu; /* IOMMU used by this device */
381 struct dmar_domain *domain; /* pointer to domain */
382 };
383
384 struct dmar_rmrr_unit {
385 struct list_head list; /* list of rmrr units */
386 struct acpi_dmar_header *hdr; /* ACPI header */
387 u64 base_address; /* reserved base address*/
388 u64 end_address; /* reserved end address */
389 struct pci_dev __rcu **devices; /* target devices */
390 int devices_cnt; /* target device count */
391 };
392
393 struct dmar_atsr_unit {
394 struct list_head list; /* list of ATSR units */
395 struct acpi_dmar_header *hdr; /* ACPI header */
396 struct pci_dev __rcu **devices; /* target devices */
397 int devices_cnt; /* target device count */
398 u8 include_all:1; /* include all ports */
399 };
400
401 static LIST_HEAD(dmar_atsr_units);
402 static LIST_HEAD(dmar_rmrr_units);
403
404 #define for_each_rmrr_units(rmrr) \
405 list_for_each_entry(rmrr, &dmar_rmrr_units, list)
406
407 static void flush_unmaps_timeout(unsigned long data);
408
409 static DEFINE_TIMER(unmap_timer, flush_unmaps_timeout, 0, 0);
410
411 #define HIGH_WATER_MARK 250
412 struct deferred_flush_tables {
413 int next;
414 struct iova *iova[HIGH_WATER_MARK];
415 struct dmar_domain *domain[HIGH_WATER_MARK];
416 };
417
418 static struct deferred_flush_tables *deferred_flush;
419
420 /* number of IOMMUs; used to size g_iommus[] and bound iommu bitmap searches */
421 static int g_num_of_iommus;
422
423 static DEFINE_SPINLOCK(async_umap_flush_lock);
424 static LIST_HEAD(unmaps_to_do);
425
426 static int timer_on;
427 static long list_size;
428
429 static void domain_exit(struct dmar_domain *domain);
430 static void domain_remove_dev_info(struct dmar_domain *domain);
431 static void domain_remove_one_dev_info(struct dmar_domain *domain,
432 struct pci_dev *pdev);
433 static void iommu_detach_dependent_devices(struct intel_iommu *iommu,
434 struct pci_dev *pdev);
435
436 #ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
437 int dmar_disabled = 0;
438 #else
439 int dmar_disabled = 1;
440 #endif /*CONFIG_INTEL_IOMMU_DEFAULT_ON*/
441
442 int intel_iommu_enabled = 0;
443 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
444
445 static int dmar_map_gfx = 1;
446 static int dmar_forcedac;
447 static int intel_iommu_strict;
448 static int intel_iommu_superpage = 1;
449
450 int intel_iommu_gfx_mapped;
451 EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
452
453 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
454 static DEFINE_SPINLOCK(device_domain_lock);
455 static LIST_HEAD(device_domain_list);
456
457 static struct iommu_ops intel_iommu_ops;
458
459 static int __init intel_iommu_setup(char *str)
460 {
461 if (!str)
462 return -EINVAL;
463 while (*str) {
464 if (!strncmp(str, "on", 2)) {
465 dmar_disabled = 0;
466 printk(KERN_INFO "Intel-IOMMU: enabled\n");
467 } else if (!strncmp(str, "off", 3)) {
468 dmar_disabled = 1;
469 printk(KERN_INFO "Intel-IOMMU: disabled\n");
470 } else if (!strncmp(str, "igfx_off", 8)) {
471 dmar_map_gfx = 0;
472 printk(KERN_INFO
473 "Intel-IOMMU: disable GFX device mapping\n");
474 } else if (!strncmp(str, "forcedac", 8)) {
475 printk(KERN_INFO
476 "Intel-IOMMU: Forcing DAC for PCI devices\n");
477 dmar_forcedac = 1;
478 } else if (!strncmp(str, "strict", 6)) {
479 printk(KERN_INFO
480 "Intel-IOMMU: disable batched IOTLB flush\n");
481 intel_iommu_strict = 1;
482 } else if (!strncmp(str, "sp_off", 6)) {
483 printk(KERN_INFO
484 "Intel-IOMMU: disable supported super page\n");
485 intel_iommu_superpage = 0;
486 }
487
488 str += strcspn(str, ",");
489 while (*str == ',')
490 str++;
491 }
492 return 0;
493 }
494 __setup("intel_iommu=", intel_iommu_setup);
495
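/*
 * Usage example (kernel command line; options may be combined with commas):
 *
 *	intel_iommu=on,strict,sp_off
 *
 * parses to dmar_disabled = 0, intel_iommu_strict = 1 and
 * intel_iommu_superpage = 0; unrecognised tokens are silently skipped by
 * the strcspn() loop above.
 */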
496 static struct kmem_cache *iommu_domain_cache;
497 static struct kmem_cache *iommu_devinfo_cache;
498 static struct kmem_cache *iommu_iova_cache;
499
500 static inline void *alloc_pgtable_page(int node)
501 {
502 struct page *page;
503 void *vaddr = NULL;
504
505 page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
506 if (page)
507 vaddr = page_address(page);
508 return vaddr;
509 }
510
511 static inline void free_pgtable_page(void *vaddr)
512 {
513 free_page((unsigned long)vaddr);
514 }
515
516 static inline void *alloc_domain_mem(void)
517 {
518 return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
519 }
520
521 static void free_domain_mem(void *vaddr)
522 {
523 kmem_cache_free(iommu_domain_cache, vaddr);
524 }
525
526 static inline void * alloc_devinfo_mem(void)
527 {
528 return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
529 }
530
531 static inline void free_devinfo_mem(void *vaddr)
532 {
533 kmem_cache_free(iommu_devinfo_cache, vaddr);
534 }
535
536 struct iova *alloc_iova_mem(void)
537 {
538 return kmem_cache_alloc(iommu_iova_cache, GFP_ATOMIC);
539 }
540
541 void free_iova_mem(struct iova *iova)
542 {
543 kmem_cache_free(iommu_iova_cache, iova);
544 }
545
546
547 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
548 {
549 unsigned long sagaw;
550 int agaw = -1;
551
552 sagaw = cap_sagaw(iommu->cap);
553 for (agaw = width_to_agaw(max_gaw);
554 agaw >= 0; agaw--) {
555 if (test_bit(agaw, &sagaw))
556 break;
557 }
558
559 return agaw;
560 }
561
562 /*
563 * Calculate max SAGAW for each iommu.
564 */
565 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
566 {
567 return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
568 }
569
570 /*
571 * Calculate agaw for each iommu.
572 * "SAGAW" may differ across iommus, so use a default agaw and fall back
573 * to a smaller supported agaw for iommus that don't support the default.
574 */
575 int iommu_calculate_agaw(struct intel_iommu *iommu)
576 {
577 return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
578 }
579
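/*
 * Worked example (capability value assumed): if cap_sagaw() reports 0x4,
 * i.e. only 4-level tables are supported, __iommu_calculate_agaw(iommu, 48)
 * starts at agaw width_to_agaw(48) = 2, finds bit 2 set and returns 2.
 * A 39-bit (agaw 1) domain on such hardware falls back to the next larger
 * supported agaw in domain_init() below.
 */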
580 /* This function only returns a single iommu in a domain */
581 static struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
582 {
583 int iommu_id;
584
585 /* si_domain and vm domain should not get here. */
586 BUG_ON(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE);
587 BUG_ON(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY);
588
589 iommu_id = find_first_bit(domain->iommu_bmp, g_num_of_iommus);
590 if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
591 return NULL;
592
593 return g_iommus[iommu_id];
594 }
595
596 static void domain_update_iommu_coherency(struct dmar_domain *domain)
597 {
598 int i;
599
600 i = find_first_bit(domain->iommu_bmp, g_num_of_iommus);
601
602 domain->iommu_coherency = i < g_num_of_iommus ? 1 : 0;
603
604 for_each_set_bit(i, domain->iommu_bmp, g_num_of_iommus) {
605 if (!ecap_coherent(g_iommus[i]->ecap)) {
606 domain->iommu_coherency = 0;
607 break;
608 }
609 }
610 }
611
612 static void domain_update_iommu_snooping(struct dmar_domain *domain)
613 {
614 int i;
615
616 domain->iommu_snooping = 1;
617
618 for_each_set_bit(i, domain->iommu_bmp, g_num_of_iommus) {
619 if (!ecap_sc_support(g_iommus[i]->ecap)) {
620 domain->iommu_snooping = 0;
621 break;
622 }
623 }
624 }
625
626 static void domain_update_iommu_superpage(struct dmar_domain *domain)
627 {
628 struct dmar_drhd_unit *drhd;
629 struct intel_iommu *iommu = NULL;
630 int mask = 0xf;
631
632 if (!intel_iommu_superpage) {
633 domain->iommu_superpage = 0;
634 return;
635 }
636
637 /* set iommu_superpage to the smallest common denominator */
638 rcu_read_lock();
639 for_each_active_iommu(iommu, drhd) {
640 mask &= cap_super_page_val(iommu->cap);
641 if (!mask) {
642 break;
643 }
644 }
645 rcu_read_unlock();
646
647 domain->iommu_superpage = fls(mask);
648 }
649
650 /* Some capabilities may be different across iommus */
651 static void domain_update_iommu_cap(struct dmar_domain *domain)
652 {
653 domain_update_iommu_coherency(domain);
654 domain_update_iommu_snooping(domain);
655 domain_update_iommu_superpage(domain);
656 }
657
658 static struct intel_iommu *device_to_iommu(int segment, u8 bus, u8 devfn)
659 {
660 struct dmar_drhd_unit *drhd = NULL;
661 struct intel_iommu *iommu;
662 struct pci_dev *dev;
663 int i;
664
665 rcu_read_lock();
666 for_each_active_iommu(iommu, drhd) {
667 if (segment != drhd->segment)
668 continue;
669
670 for_each_active_dev_scope(drhd->devices,
671 drhd->devices_cnt, i, dev) {
672 if (dev->bus->number == bus && dev->devfn == devfn)
673 goto out;
674 if (dev->subordinate &&
675 dev->subordinate->number <= bus &&
676 dev->subordinate->busn_res.end >= bus)
677 goto out;
678 }
679
680 if (drhd->include_all)
681 goto out;
682 }
683 iommu = NULL;
684 out:
685 rcu_read_unlock();
686
687 return iommu;
688 }
689
690 static void domain_flush_cache(struct dmar_domain *domain,
691 void *addr, int size)
692 {
693 if (!domain->iommu_coherency)
694 clflush_cache_range(addr, size);
695 }
696
697 /* Gets context entry for a given bus and devfn */
698 static struct context_entry * device_to_context_entry(struct intel_iommu *iommu,
699 u8 bus, u8 devfn)
700 {
701 struct root_entry *root;
702 struct context_entry *context;
703 unsigned long phy_addr;
704 unsigned long flags;
705
706 spin_lock_irqsave(&iommu->lock, flags);
707 root = &iommu->root_entry[bus];
708 context = get_context_addr_from_root(root);
709 if (!context) {
710 context = (struct context_entry *)
711 alloc_pgtable_page(iommu->node);
712 if (!context) {
713 spin_unlock_irqrestore(&iommu->lock, flags);
714 return NULL;
715 }
716 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
717 phy_addr = virt_to_phys((void *)context);
718 set_root_value(root, phy_addr);
719 set_root_present(root);
720 __iommu_flush_cache(iommu, root, sizeof(*root));
721 }
722 spin_unlock_irqrestore(&iommu->lock, flags);
723 return &context[devfn];
724 }
725
726 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
727 {
728 struct root_entry *root;
729 struct context_entry *context;
730 int ret;
731 unsigned long flags;
732
733 spin_lock_irqsave(&iommu->lock, flags);
734 root = &iommu->root_entry[bus];
735 context = get_context_addr_from_root(root);
736 if (!context) {
737 ret = 0;
738 goto out;
739 }
740 ret = context_present(&context[devfn]);
741 out:
742 spin_unlock_irqrestore(&iommu->lock, flags);
743 return ret;
744 }
745
746 static void clear_context_table(struct intel_iommu *iommu, u8 bus, u8 devfn)
747 {
748 struct root_entry *root;
749 struct context_entry *context;
750 unsigned long flags;
751
752 spin_lock_irqsave(&iommu->lock, flags);
753 root = &iommu->root_entry[bus];
754 context = get_context_addr_from_root(root);
755 if (context) {
756 context_clear_entry(&context[devfn]);
757 __iommu_flush_cache(iommu, &context[devfn], \
758 sizeof(*context));
759 }
760 spin_unlock_irqrestore(&iommu->lock, flags);
761 }
762
763 static void free_context_table(struct intel_iommu *iommu)
764 {
765 struct root_entry *root;
766 int i;
767 unsigned long flags;
768 struct context_entry *context;
769
770 spin_lock_irqsave(&iommu->lock, flags);
771 if (!iommu->root_entry) {
772 goto out;
773 }
774 for (i = 0; i < ROOT_ENTRY_NR; i++) {
775 root = &iommu->root_entry[i];
776 context = get_context_addr_from_root(root);
777 if (context)
778 free_pgtable_page(context);
779 }
780 free_pgtable_page(iommu->root_entry);
781 iommu->root_entry = NULL;
782 out:
783 spin_unlock_irqrestore(&iommu->lock, flags);
784 }
785
786 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
787 unsigned long pfn, int *target_level)
788 {
789 int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
790 struct dma_pte *parent, *pte = NULL;
791 int level = agaw_to_level(domain->agaw);
792 int offset;
793
794 BUG_ON(!domain->pgd);
795
796 if (addr_width < BITS_PER_LONG && pfn >> addr_width)
797 /* Address beyond IOMMU's addressing capabilities. */
798 return NULL;
799
800 parent = domain->pgd;
801
802 while (1) {
803 void *tmp_page;
804
805 offset = pfn_level_offset(pfn, level);
806 pte = &parent[offset];
807 if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
808 break;
809 if (level == *target_level)
810 break;
811
812 if (!dma_pte_present(pte)) {
813 uint64_t pteval;
814
815 tmp_page = alloc_pgtable_page(domain->nid);
816
817 if (!tmp_page)
818 return NULL;
819
820 domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
821 pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
822 if (cmpxchg64(&pte->val, 0ULL, pteval)) {
823 /* Someone else set it while we were thinking; use theirs. */
824 free_pgtable_page(tmp_page);
825 } else {
826 dma_pte_addr(pte);
827 domain_flush_cache(domain, pte, sizeof(*pte));
828 }
829 }
830 if (level == 1)
831 break;
832
833 parent = phys_to_virt(dma_pte_addr(pte));
834 level--;
835 }
836
837 if (!*target_level)
838 *target_level = level;
839
840 return pte;
841 }
842
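/*
 * Illustrative walk: for a 4-level (agaw 2) domain, looking up pfn 0x12345
 * with *target_level == 1 indexes
 *
 *	level 4: pfn_level_offset(0x12345, 4) = (0x12345 >> 27) & 0x1ff = 0
 *	level 3: (0x12345 >> 18) & 0x1ff = 0
 *	level 2: (0x12345 >>  9) & 0x1ff = 0x091
 *	level 1: (0x12345 >>  0) & 0x1ff = 0x145
 *
 * allocating any missing intermediate tables on the way down; the
 * cmpxchg64() ensures a concurrent mapper cannot install a second table
 * for the same slot.
 */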
843
844 /* return address's pte at specific level */
845 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
846 unsigned long pfn,
847 int level, int *large_page)
848 {
849 struct dma_pte *parent, *pte = NULL;
850 int total = agaw_to_level(domain->agaw);
851 int offset;
852
853 parent = domain->pgd;
854 while (level <= total) {
855 offset = pfn_level_offset(pfn, total);
856 pte = &parent[offset];
857 if (level == total)
858 return pte;
859
860 if (!dma_pte_present(pte)) {
861 *large_page = total;
862 break;
863 }
864
865 if (pte->val & DMA_PTE_LARGE_PAGE) {
866 *large_page = total;
867 return pte;
868 }
869
870 parent = phys_to_virt(dma_pte_addr(pte));
871 total--;
872 }
873 return NULL;
874 }
875
876 /* clear last level pte; a tlb flush should follow */
877 static void dma_pte_clear_range(struct dmar_domain *domain,
878 unsigned long start_pfn,
879 unsigned long last_pfn)
880 {
881 int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
882 unsigned int large_page = 1;
883 struct dma_pte *first_pte, *pte;
884
885 BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width);
886 BUG_ON(addr_width < BITS_PER_LONG && last_pfn >> addr_width);
887 BUG_ON(start_pfn > last_pfn);
888
889 /* we don't need lock here; nobody else touches the iova range */
890 do {
891 large_page = 1;
892 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
893 if (!pte) {
894 start_pfn = align_to_level(start_pfn + 1, large_page + 1);
895 continue;
896 }
897 do {
898 dma_clear_pte(pte);
899 start_pfn += lvl_to_nr_pages(large_page);
900 pte++;
901 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
902
903 domain_flush_cache(domain, first_pte,
904 (void *)pte - (void *)first_pte);
905
906 } while (start_pfn && start_pfn <= last_pfn);
907 }
908
909 static void dma_pte_free_level(struct dmar_domain *domain, int level,
910 struct dma_pte *pte, unsigned long pfn,
911 unsigned long start_pfn, unsigned long last_pfn)
912 {
913 pfn = max(start_pfn, pfn);
914 pte = &pte[pfn_level_offset(pfn, level)];
915
916 do {
917 unsigned long level_pfn;
918 struct dma_pte *level_pte;
919
920 if (!dma_pte_present(pte) || dma_pte_superpage(pte))
921 goto next;
922
923 level_pfn = pfn & level_mask(level - 1);
924 level_pte = phys_to_virt(dma_pte_addr(pte));
925
926 if (level > 2)
927 dma_pte_free_level(domain, level - 1, level_pte,
928 level_pfn, start_pfn, last_pfn);
929
930 /* If range covers entire pagetable, free it */
931 if (!(start_pfn > level_pfn ||
932 last_pfn < level_pfn + level_size(level) - 1)) {
933 dma_clear_pte(pte);
934 domain_flush_cache(domain, pte, sizeof(*pte));
935 free_pgtable_page(level_pte);
936 }
937 next:
938 pfn += level_size(level);
939 } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
940 }
941
942 /* free page table pages. last level pte should already be cleared */
943 static void dma_pte_free_pagetable(struct dmar_domain *domain,
944 unsigned long start_pfn,
945 unsigned long last_pfn)
946 {
947 int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
948
949 BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width);
950 BUG_ON(addr_width < BITS_PER_LONG && last_pfn >> addr_width);
951 BUG_ON(start_pfn > last_pfn);
952
953 /* We don't need lock here; nobody else touches the iova range */
954 dma_pte_free_level(domain, agaw_to_level(domain->agaw),
955 domain->pgd, 0, start_pfn, last_pfn);
956
957 /* free pgd */
958 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
959 free_pgtable_page(domain->pgd);
960 domain->pgd = NULL;
961 }
962 }
963
964 /* iommu handling */
965 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
966 {
967 struct root_entry *root;
968 unsigned long flags;
969
970 root = (struct root_entry *)alloc_pgtable_page(iommu->node);
971 if (!root)
972 return -ENOMEM;
973
974 __iommu_flush_cache(iommu, root, ROOT_SIZE);
975
976 spin_lock_irqsave(&iommu->lock, flags);
977 iommu->root_entry = root;
978 spin_unlock_irqrestore(&iommu->lock, flags);
979
980 return 0;
981 }
982
983 static void iommu_set_root_entry(struct intel_iommu *iommu)
984 {
985 void *addr;
986 u32 sts;
987 unsigned long flag;
988
989 addr = iommu->root_entry;
990
991 raw_spin_lock_irqsave(&iommu->register_lock, flag);
992 dmar_writeq(iommu->reg + DMAR_RTADDR_REG, virt_to_phys(addr));
993
994 writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
995
996 /* Make sure hardware complete it */
997 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
998 readl, (sts & DMA_GSTS_RTPS), sts);
999
1000 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1001 }
1002
1003 static void iommu_flush_write_buffer(struct intel_iommu *iommu)
1004 {
1005 u32 val;
1006 unsigned long flag;
1007
1008 if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1009 return;
1010
1011 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1012 writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1013
1014 /* Make sure hardware complete it */
1015 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1016 readl, (!(val & DMA_GSTS_WBFS)), val);
1017
1018 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1019 }
1020
1021 /* return value determines if we need a write buffer flush */
1022 static void __iommu_flush_context(struct intel_iommu *iommu,
1023 u16 did, u16 source_id, u8 function_mask,
1024 u64 type)
1025 {
1026 u64 val = 0;
1027 unsigned long flag;
1028
1029 switch (type) {
1030 case DMA_CCMD_GLOBAL_INVL:
1031 val = DMA_CCMD_GLOBAL_INVL;
1032 break;
1033 case DMA_CCMD_DOMAIN_INVL:
1034 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1035 break;
1036 case DMA_CCMD_DEVICE_INVL:
1037 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1038 | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1039 break;
1040 default:
1041 BUG();
1042 }
1043 val |= DMA_CCMD_ICC;
1044
1045 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1046 dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1047
1048 /* Make sure hardware complete it */
1049 IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1050 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1051
1052 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1053 }
1054
1055 /* return value determines if we need a write buffer flush */
1056 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1057 u64 addr, unsigned int size_order, u64 type)
1058 {
1059 int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1060 u64 val = 0, val_iva = 0;
1061 unsigned long flag;
1062
1063 switch (type) {
1064 case DMA_TLB_GLOBAL_FLUSH:
1065 /* global flush doesn't need set IVA_REG */
1066 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1067 break;
1068 case DMA_TLB_DSI_FLUSH:
1069 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1070 break;
1071 case DMA_TLB_PSI_FLUSH:
1072 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1073 /* Note: always flush non-leaf currently */
1074 val_iva = size_order | addr;
1075 break;
1076 default:
1077 BUG();
1078 }
1079 /* Note: set drain read/write */
1080 #if 0
1081 /*
1082 * This is probably only here to be extra safe; it looks like we can
1083 * ignore it without any impact.
1084 */
1085 if (cap_read_drain(iommu->cap))
1086 val |= DMA_TLB_READ_DRAIN;
1087 #endif
1088 if (cap_write_drain(iommu->cap))
1089 val |= DMA_TLB_WRITE_DRAIN;
1090
1091 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1092 /* Note: Only uses first TLB reg currently */
1093 if (val_iva)
1094 dmar_writeq(iommu->reg + tlb_offset, val_iva);
1095 dmar_writeq(iommu->reg + tlb_offset + 8, val);
1096
1097 /* Make sure hardware complete it */
1098 IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1099 dmar_readq, (!(val & DMA_TLB_IVT)), val);
1100
1101 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1102
1103 /* check IOTLB invalidation granularity */
1104 if (DMA_TLB_IAIG(val) == 0)
1105 printk(KERN_ERR"IOMMU: flush IOTLB failed\n");
1106 if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1107 pr_debug("IOMMU: tlb flush request %Lx, actual %Lx\n",
1108 (unsigned long long)DMA_TLB_IIRG(type),
1109 (unsigned long long)DMA_TLB_IAIG(val));
1110 }
1111
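/*
 * Illustrative encoding (values assumed): a page-selective flush of
 * domain 5 covering a 4-page region at IOVA 0x100000 writes
 *
 *	IVA register   = 0x100000 | 2		(addr | size_order)
 *	IOTLB register = DMA_TLB_PSI_FLUSH | DMA_TLB_IVT | DMA_TLB_DID(5)
 *			 (plus DMA_TLB_WRITE_DRAIN when the capability is set)
 *
 * and then polls the IOTLB register until hardware clears DMA_TLB_IVT.
 */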
1112 static struct device_domain_info *iommu_support_dev_iotlb(
1113 struct dmar_domain *domain, int segment, u8 bus, u8 devfn)
1114 {
1115 int found = 0;
1116 unsigned long flags;
1117 struct device_domain_info *info;
1118 struct intel_iommu *iommu = device_to_iommu(segment, bus, devfn);
1119
1120 if (!ecap_dev_iotlb_support(iommu->ecap))
1121 return NULL;
1122
1123 if (!iommu->qi)
1124 return NULL;
1125
1126 spin_lock_irqsave(&device_domain_lock, flags);
1127 list_for_each_entry(info, &domain->devices, link)
1128 if (info->bus == bus && info->devfn == devfn) {
1129 found = 1;
1130 break;
1131 }
1132 spin_unlock_irqrestore(&device_domain_lock, flags);
1133
1134 if (!found || !info->dev)
1135 return NULL;
1136
1137 if (!pci_find_ext_capability(info->dev, PCI_EXT_CAP_ID_ATS))
1138 return NULL;
1139
1140 if (!dmar_find_matched_atsr_unit(info->dev))
1141 return NULL;
1142
1143 info->iommu = iommu;
1144
1145 return info;
1146 }
1147
1148 static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1149 {
1150 if (!info)
1151 return;
1152
1153 pci_enable_ats(info->dev, VTD_PAGE_SHIFT);
1154 }
1155
1156 static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1157 {
1158 if (!info->dev || !pci_ats_enabled(info->dev))
1159 return;
1160
1161 pci_disable_ats(info->dev);
1162 }
1163
1164 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1165 u64 addr, unsigned mask)
1166 {
1167 u16 sid, qdep;
1168 unsigned long flags;
1169 struct device_domain_info *info;
1170
1171 spin_lock_irqsave(&device_domain_lock, flags);
1172 list_for_each_entry(info, &domain->devices, link) {
1173 if (!info->dev || !pci_ats_enabled(info->dev))
1174 continue;
1175
1176 sid = info->bus << 8 | info->devfn;
1177 qdep = pci_ats_queue_depth(info->dev);
1178 qi_flush_dev_iotlb(info->iommu, sid, qdep, addr, mask);
1179 }
1180 spin_unlock_irqrestore(&device_domain_lock, flags);
1181 }
1182
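/*
 * Worked example (device address assumed): for an ATS-capable device at
 * 02:01.3, devfn = (1 << 3) | 3 = 0x0b and sid = 0x02 << 8 | 0x0b = 0x20b;
 * qi_flush_dev_iotlb() then invalidates the same naturally aligned region
 * in the device's own IOTLB, using its reported invalidation queue depth
 * 'qdep'.
 */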
1183 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
1184 unsigned long pfn, unsigned int pages, int map)
1185 {
1186 unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1187 uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1188
1189 BUG_ON(pages == 0);
1190
1191 /*
1192 * Fall back to domain-selective flush if there is no PSI support or
1193 * the size is too big.
1194 * PSI requires the page size to be 2 ^ x, and the base address to be
1195 * naturally aligned to that size.
1196 */
1197 if (!cap_pgsel_inv(iommu->cap) || mask > cap_max_amask_val(iommu->cap))
1198 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1199 DMA_TLB_DSI_FLUSH);
1200 else
1201 iommu->flush.flush_iotlb(iommu, did, addr, mask,
1202 DMA_TLB_PSI_FLUSH);
1203
1204 /*
1205 * In caching mode, changes of pages from non-present to present require
1206 * flush. However, device IOTLB doesn't need to be flushed in this case.
1207 */
1208 if (!cap_caching_mode(iommu->cap) || !map)
1209 iommu_flush_dev_iotlb(iommu->domains[did], addr, mask);
1210 }
1211
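/*
 * Worked example: unmapping 3 pages gives
 * mask = ilog2(__roundup_pow_of_two(3)) = ilog2(4) = 2, so the PSI
 * invalidation covers a naturally aligned 4-page (16KiB) region around
 * 'addr'; if mask exceeded cap_max_amask_val() the code above would fall
 * back to a domain-selective flush instead.
 */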
1212 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1213 {
1214 u32 pmen;
1215 unsigned long flags;
1216
1217 raw_spin_lock_irqsave(&iommu->register_lock, flags);
1218 pmen = readl(iommu->reg + DMAR_PMEN_REG);
1219 pmen &= ~DMA_PMEN_EPM;
1220 writel(pmen, iommu->reg + DMAR_PMEN_REG);
1221
1222 /* wait for the protected region status bit to clear */
1223 IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1224 readl, !(pmen & DMA_PMEN_PRS), pmen);
1225
1226 raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1227 }
1228
1229 static int iommu_enable_translation(struct intel_iommu *iommu)
1230 {
1231 u32 sts;
1232 unsigned long flags;
1233
1234 raw_spin_lock_irqsave(&iommu->register_lock, flags);
1235 iommu->gcmd |= DMA_GCMD_TE;
1236 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1237
1238 /* Make sure hardware complete it */
1239 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1240 readl, (sts & DMA_GSTS_TES), sts);
1241
1242 raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1243 return 0;
1244 }
1245
1246 static int iommu_disable_translation(struct intel_iommu *iommu)
1247 {
1248 u32 sts;
1249 unsigned long flag;
1250
1251 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1252 iommu->gcmd &= ~DMA_GCMD_TE;
1253 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1254
1255 /* Make sure hardware complete it */
1256 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1257 readl, (!(sts & DMA_GSTS_TES)), sts);
1258
1259 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1260 return 0;
1261 }
1262
1263
1264 static int iommu_init_domains(struct intel_iommu *iommu)
1265 {
1266 unsigned long ndomains;
1267 unsigned long nlongs;
1268
1269 ndomains = cap_ndoms(iommu->cap);
1270 pr_debug("IOMMU%d: Number of Domains supported <%ld>\n",
1271 iommu->seq_id, ndomains);
1272 nlongs = BITS_TO_LONGS(ndomains);
1273
1274 spin_lock_init(&iommu->lock);
1275
1276 /* TBD: there might be 64K domains,
1277 * consider other allocation for future chip
1278 */
1279 iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1280 if (!iommu->domain_ids) {
1281 pr_err("IOMMU%d: allocating domain id array failed\n",
1282 iommu->seq_id);
1283 return -ENOMEM;
1284 }
1285 iommu->domains = kcalloc(ndomains, sizeof(struct dmar_domain *),
1286 GFP_KERNEL);
1287 if (!iommu->domains) {
1288 pr_err("IOMMU%d: allocating domain array failed\n",
1289 iommu->seq_id);
1290 kfree(iommu->domain_ids);
1291 iommu->domain_ids = NULL;
1292 return -ENOMEM;
1293 }
1294
1295 /*
1296 * If Caching mode is set, then invalid translations are tagged
1297 * with domain id 0. Hence we need to pre-allocate it.
1298 */
1299 if (cap_caching_mode(iommu->cap))
1300 set_bit(0, iommu->domain_ids);
1301 return 0;
1302 }
1303
1304 static void free_dmar_iommu(struct intel_iommu *iommu)
1305 {
1306 struct dmar_domain *domain;
1307 int i, count;
1308 unsigned long flags;
1309
1310 if ((iommu->domains) && (iommu->domain_ids)) {
1311 for_each_set_bit(i, iommu->domain_ids, cap_ndoms(iommu->cap)) {
1312 /*
1313 * Domain id 0 is reserved for invalid translation
1314 * if hardware supports caching mode.
1315 */
1316 if (cap_caching_mode(iommu->cap) && i == 0)
1317 continue;
1318
1319 domain = iommu->domains[i];
1320 clear_bit(i, iommu->domain_ids);
1321
1322 spin_lock_irqsave(&domain->iommu_lock, flags);
1323 count = --domain->iommu_count;
1324 spin_unlock_irqrestore(&domain->iommu_lock, flags);
1325 if (count == 0)
1326 domain_exit(domain);
1327 }
1328 }
1329
1330 if (iommu->gcmd & DMA_GCMD_TE)
1331 iommu_disable_translation(iommu);
1332
1333 kfree(iommu->domains);
1334 kfree(iommu->domain_ids);
1335 iommu->domains = NULL;
1336 iommu->domain_ids = NULL;
1337
1338 g_iommus[iommu->seq_id] = NULL;
1339
1340 /* free context mapping */
1341 free_context_table(iommu);
1342 }
1343
1344 static struct dmar_domain *alloc_domain(bool vm)
1345 {
1346 /* domain id for virtual machine, it won't be set in context */
1347 static atomic_t vm_domid = ATOMIC_INIT(0);
1348 struct dmar_domain *domain;
1349
1350 domain = alloc_domain_mem();
1351 if (!domain)
1352 return NULL;
1353
1354 domain->nid = -1;
1355 domain->iommu_count = 0;
1356 memset(domain->iommu_bmp, 0, sizeof(domain->iommu_bmp));
1357 domain->flags = 0;
1358 spin_lock_init(&domain->iommu_lock);
1359 INIT_LIST_HEAD(&domain->devices);
1360 if (vm) {
1361 domain->id = atomic_inc_return(&vm_domid);
1362 domain->flags = DOMAIN_FLAG_VIRTUAL_MACHINE;
1363 }
1364
1365 return domain;
1366 }
1367
1368 static int iommu_attach_domain(struct dmar_domain *domain,
1369 struct intel_iommu *iommu)
1370 {
1371 int num;
1372 unsigned long ndomains;
1373 unsigned long flags;
1374
1375 ndomains = cap_ndoms(iommu->cap);
1376
1377 spin_lock_irqsave(&iommu->lock, flags);
1378
1379 num = find_first_zero_bit(iommu->domain_ids, ndomains);
1380 if (num >= ndomains) {
1381 spin_unlock_irqrestore(&iommu->lock, flags);
1382 printk(KERN_ERR "IOMMU: no free domain ids\n");
1383 return -ENOMEM;
1384 }
1385
1386 domain->id = num;
1387 domain->iommu_count++;
1388 set_bit(num, iommu->domain_ids);
1389 set_bit(iommu->seq_id, domain->iommu_bmp);
1390 iommu->domains[num] = domain;
1391 spin_unlock_irqrestore(&iommu->lock, flags);
1392
1393 return 0;
1394 }
1395
1396 static void iommu_detach_domain(struct dmar_domain *domain,
1397 struct intel_iommu *iommu)
1398 {
1399 unsigned long flags;
1400 int num, ndomains;
1401
1402 spin_lock_irqsave(&iommu->lock, flags);
1403 ndomains = cap_ndoms(iommu->cap);
1404 for_each_set_bit(num, iommu->domain_ids, ndomains) {
1405 if (iommu->domains[num] == domain) {
1406 clear_bit(num, iommu->domain_ids);
1407 iommu->domains[num] = NULL;
1408 break;
1409 }
1410 }
1411 spin_unlock_irqrestore(&iommu->lock, flags);
1412 }
1413
1414 static struct iova_domain reserved_iova_list;
1415 static struct lock_class_key reserved_rbtree_key;
1416
1417 static int dmar_init_reserved_ranges(void)
1418 {
1419 struct pci_dev *pdev = NULL;
1420 struct iova *iova;
1421 int i;
1422
1423 init_iova_domain(&reserved_iova_list, DMA_32BIT_PFN);
1424
1425 lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1426 &reserved_rbtree_key);
1427
1428 /* IOAPIC ranges shouldn't be accessed by DMA */
1429 iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1430 IOVA_PFN(IOAPIC_RANGE_END));
1431 if (!iova) {
1432 printk(KERN_ERR "Reserve IOAPIC range failed\n");
1433 return -ENODEV;
1434 }
1435
1436 /* Reserve all PCI MMIO to avoid peer-to-peer access */
1437 for_each_pci_dev(pdev) {
1438 struct resource *r;
1439
1440 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1441 r = &pdev->resource[i];
1442 if (!r->flags || !(r->flags & IORESOURCE_MEM))
1443 continue;
1444 iova = reserve_iova(&reserved_iova_list,
1445 IOVA_PFN(r->start),
1446 IOVA_PFN(r->end));
1447 if (!iova) {
1448 printk(KERN_ERR "Reserve iova failed\n");
1449 return -ENODEV;
1450 }
1451 }
1452 }
1453 return 0;
1454 }
1455
1456 static void domain_reserve_special_ranges(struct dmar_domain *domain)
1457 {
1458 copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1459 }
1460
1461 static inline int guestwidth_to_adjustwidth(int gaw)
1462 {
1463 int agaw;
1464 int r = (gaw - 12) % 9;
1465
1466 if (r == 0)
1467 agaw = gaw;
1468 else
1469 agaw = gaw + 9 - r;
1470 if (agaw > 64)
1471 agaw = 64;
1472 return agaw;
1473 }
1474
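/*
 * Worked example: guestwidth_to_adjustwidth(39) has r = (39 - 12) % 9 = 0,
 * so 39 already corresponds to a whole number of 9-bit levels above the
 * 4KiB page and is returned unchanged; guestwidth_to_adjustwidth(40) rounds
 * up to 48, the next width a full extra page-table level can cover.
 */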
1475 static int domain_init(struct dmar_domain *domain, int guest_width)
1476 {
1477 struct intel_iommu *iommu;
1478 int adjust_width, agaw;
1479 unsigned long sagaw;
1480
1481 init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
1482 domain_reserve_special_ranges(domain);
1483
1484 /* calculate AGAW */
1485 iommu = domain_get_iommu(domain);
1486 if (guest_width > cap_mgaw(iommu->cap))
1487 guest_width = cap_mgaw(iommu->cap);
1488 domain->gaw = guest_width;
1489 adjust_width = guestwidth_to_adjustwidth(guest_width);
1490 agaw = width_to_agaw(adjust_width);
1491 sagaw = cap_sagaw(iommu->cap);
1492 if (!test_bit(agaw, &sagaw)) {
1493 /* hardware doesn't support it, choose a bigger one */
1494 pr_debug("IOMMU: hardware doesn't support agaw %d\n", agaw);
1495 agaw = find_next_bit(&sagaw, 5, agaw);
1496 if (agaw >= 5)
1497 return -ENODEV;
1498 }
1499 domain->agaw = agaw;
1500
1501 if (ecap_coherent(iommu->ecap))
1502 domain->iommu_coherency = 1;
1503 else
1504 domain->iommu_coherency = 0;
1505
1506 if (ecap_sc_support(iommu->ecap))
1507 domain->iommu_snooping = 1;
1508 else
1509 domain->iommu_snooping = 0;
1510
1511 domain->iommu_superpage = fls(cap_super_page_val(iommu->cap));
1512 domain->nid = iommu->node;
1513
1514 /* always allocate the top pgd */
1515 domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
1516 if (!domain->pgd)
1517 return -ENOMEM;
1518 __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
1519 return 0;
1520 }
1521
1522 static void domain_exit(struct dmar_domain *domain)
1523 {
1524 struct dmar_drhd_unit *drhd;
1525 struct intel_iommu *iommu;
1526
1527 /* Domain 0 is reserved, so don't process it */
1528 if (!domain)
1529 return;
1530
1531 /* Flush any lazy unmaps that may reference this domain */
1532 if (!intel_iommu_strict)
1533 flush_unmaps_timeout(0);
1534
1535 /* remove associated devices */
1536 domain_remove_dev_info(domain);
1537
1538 /* destroy iovas */
1539 put_iova_domain(&domain->iovad);
1540
1541 /* clear ptes */
1542 dma_pte_clear_range(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1543
1544 /* free page tables */
1545 dma_pte_free_pagetable(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1546
1547 /* clear attached or cached domains */
1548 rcu_read_lock();
1549 for_each_active_iommu(iommu, drhd)
1550 if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE ||
1551 test_bit(iommu->seq_id, domain->iommu_bmp))
1552 iommu_detach_domain(domain, iommu);
1553 rcu_read_unlock();
1554
1555 free_domain_mem(domain);
1556 }
1557
1558 static int domain_context_mapping_one(struct dmar_domain *domain, int segment,
1559 u8 bus, u8 devfn, int translation)
1560 {
1561 struct context_entry *context;
1562 unsigned long flags;
1563 struct intel_iommu *iommu;
1564 struct dma_pte *pgd;
1565 unsigned long num;
1566 unsigned long ndomains;
1567 int id;
1568 int agaw;
1569 struct device_domain_info *info = NULL;
1570
1571 pr_debug("Set context mapping for %02x:%02x.%d\n",
1572 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1573
1574 BUG_ON(!domain->pgd);
1575 BUG_ON(translation != CONTEXT_TT_PASS_THROUGH &&
1576 translation != CONTEXT_TT_MULTI_LEVEL);
1577
1578 iommu = device_to_iommu(segment, bus, devfn);
1579 if (!iommu)
1580 return -ENODEV;
1581
1582 context = device_to_context_entry(iommu, bus, devfn);
1583 if (!context)
1584 return -ENOMEM;
1585 spin_lock_irqsave(&iommu->lock, flags);
1586 if (context_present(context)) {
1587 spin_unlock_irqrestore(&iommu->lock, flags);
1588 return 0;
1589 }
1590
1591 id = domain->id;
1592 pgd = domain->pgd;
1593
1594 if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE ||
1595 domain->flags & DOMAIN_FLAG_STATIC_IDENTITY) {
1596 int found = 0;
1597
1598 /* find an available domain id for this device in iommu */
1599 ndomains = cap_ndoms(iommu->cap);
1600 for_each_set_bit(num, iommu->domain_ids, ndomains) {
1601 if (iommu->domains[num] == domain) {
1602 id = num;
1603 found = 1;
1604 break;
1605 }
1606 }
1607
1608 if (found == 0) {
1609 num = find_first_zero_bit(iommu->domain_ids, ndomains);
1610 if (num >= ndomains) {
1611 spin_unlock_irqrestore(&iommu->lock, flags);
1612 printk(KERN_ERR "IOMMU: no free domain ids\n");
1613 return -EFAULT;
1614 }
1615
1616 set_bit(num, iommu->domain_ids);
1617 iommu->domains[num] = domain;
1618 id = num;
1619 }
1620
1621 /* Skip top levels of page tables for
1622 * iommus which have a smaller agaw than the default.
1623 * Unnecessary for PT mode.
1624 */
1625 if (translation != CONTEXT_TT_PASS_THROUGH) {
1626 for (agaw = domain->agaw; agaw != iommu->agaw; agaw--) {
1627 pgd = phys_to_virt(dma_pte_addr(pgd));
1628 if (!dma_pte_present(pgd)) {
1629 spin_unlock_irqrestore(&iommu->lock, flags);
1630 return -ENOMEM;
1631 }
1632 }
1633 }
1634 }
1635
1636 context_set_domain_id(context, id);
1637
1638 if (translation != CONTEXT_TT_PASS_THROUGH) {
1639 info = iommu_support_dev_iotlb(domain, segment, bus, devfn);
1640 translation = info ? CONTEXT_TT_DEV_IOTLB :
1641 CONTEXT_TT_MULTI_LEVEL;
1642 }
1643 /*
1644 * In pass through mode, AW must be programmed to indicate the largest
1645 * AGAW value supported by hardware. And ASR is ignored by hardware.
1646 */
1647 if (unlikely(translation == CONTEXT_TT_PASS_THROUGH))
1648 context_set_address_width(context, iommu->msagaw);
1649 else {
1650 context_set_address_root(context, virt_to_phys(pgd));
1651 context_set_address_width(context, iommu->agaw);
1652 }
1653
1654 context_set_translation_type(context, translation);
1655 context_set_fault_enable(context);
1656 context_set_present(context);
1657 domain_flush_cache(domain, context, sizeof(*context));
1658
1659 /*
1660 * It's a non-present to present mapping. If hardware doesn't cache
1661 * non-present entries we only need to flush the write-buffer. If it
1662 * _does_ cache non-present entries, then it does so in the special
1663 * domain #0, which we have to flush:
1664 */
1665 if (cap_caching_mode(iommu->cap)) {
1666 iommu->flush.flush_context(iommu, 0,
1667 (((u16)bus) << 8) | devfn,
1668 DMA_CCMD_MASK_NOBIT,
1669 DMA_CCMD_DEVICE_INVL);
1670 iommu->flush.flush_iotlb(iommu, domain->id, 0, 0, DMA_TLB_DSI_FLUSH);
1671 } else {
1672 iommu_flush_write_buffer(iommu);
1673 }
1674 iommu_enable_dev_iotlb(info);
1675 spin_unlock_irqrestore(&iommu->lock, flags);
1676
1677 spin_lock_irqsave(&domain->iommu_lock, flags);
1678 if (!test_and_set_bit(iommu->seq_id, domain->iommu_bmp)) {
1679 domain->iommu_count++;
1680 if (domain->iommu_count == 1)
1681 domain->nid = iommu->node;
1682 domain_update_iommu_cap(domain);
1683 }
1684 spin_unlock_irqrestore(&domain->iommu_lock, flags);
1685 return 0;
1686 }
1687
1688 static int
1689 domain_context_mapping(struct dmar_domain *domain, struct pci_dev *pdev,
1690 int translation)
1691 {
1692 int ret;
1693 struct pci_dev *tmp, *parent;
1694
1695 ret = domain_context_mapping_one(domain, pci_domain_nr(pdev->bus),
1696 pdev->bus->number, pdev->devfn,
1697 translation);
1698 if (ret)
1699 return ret;
1700
1701 /* dependent device mapping */
1702 tmp = pci_find_upstream_pcie_bridge(pdev);
1703 if (!tmp)
1704 return 0;
1705 /* Secondary interface's bus number and devfn 0 */
1706 parent = pdev->bus->self;
1707 while (parent != tmp) {
1708 ret = domain_context_mapping_one(domain,
1709 pci_domain_nr(parent->bus),
1710 parent->bus->number,
1711 parent->devfn, translation);
1712 if (ret)
1713 return ret;
1714 parent = parent->bus->self;
1715 }
1716 if (pci_is_pcie(tmp)) /* this is a PCIe-to-PCI bridge */
1717 return domain_context_mapping_one(domain,
1718 pci_domain_nr(tmp->subordinate),
1719 tmp->subordinate->number, 0,
1720 translation);
1721 else /* this is a legacy PCI bridge */
1722 return domain_context_mapping_one(domain,
1723 pci_domain_nr(tmp->bus),
1724 tmp->bus->number,
1725 tmp->devfn,
1726 translation);
1727 }
1728
1729 static int domain_context_mapped(struct pci_dev *pdev)
1730 {
1731 int ret;
1732 struct pci_dev *tmp, *parent;
1733 struct intel_iommu *iommu;
1734
1735 iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
1736 pdev->devfn);
1737 if (!iommu)
1738 return -ENODEV;
1739
1740 ret = device_context_mapped(iommu, pdev->bus->number, pdev->devfn);
1741 if (!ret)
1742 return ret;
1743 /* dependent device mapping */
1744 tmp = pci_find_upstream_pcie_bridge(pdev);
1745 if (!tmp)
1746 return ret;
1747 /* Secondary interface's bus number and devfn 0 */
1748 parent = pdev->bus->self;
1749 while (parent != tmp) {
1750 ret = device_context_mapped(iommu, parent->bus->number,
1751 parent->devfn);
1752 if (!ret)
1753 return ret;
1754 parent = parent->bus->self;
1755 }
1756 if (pci_is_pcie(tmp))
1757 return device_context_mapped(iommu, tmp->subordinate->number,
1758 0);
1759 else
1760 return device_context_mapped(iommu, tmp->bus->number,
1761 tmp->devfn);
1762 }
1763
1764 /* Returns a number of VTD pages, but aligned to MM page size */
1765 static inline unsigned long aligned_nrpages(unsigned long host_addr,
1766 size_t size)
1767 {
1768 host_addr &= ~PAGE_MASK;
1769 return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
1770 }
1771
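/*
 * Worked example: aligned_nrpages(0x1234, 0x2000) keeps only the offset
 * within the 4KiB MM page (0x234), rounds 0x234 + 0x2000 up to the next
 * MM page boundary (0x3000) and returns 0x3000 >> 12 = 3 VT-d pages, so
 * a buffer that straddles page boundaries always gets enough IOVA space.
 */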
1772 /* Return largest possible superpage level for a given mapping */
1773 static inline int hardware_largepage_caps(struct dmar_domain *domain,
1774 unsigned long iov_pfn,
1775 unsigned long phy_pfn,
1776 unsigned long pages)
1777 {
1778 int support, level = 1;
1779 unsigned long pfnmerge;
1780
1781 support = domain->iommu_superpage;
1782
1783 /* To use a large page, the virtual *and* physical addresses
1784 must be aligned to 2MiB/1GiB/etc. Lower bits set in either
1785 of them will mean we have to use smaller pages. So just
1786 merge them and check both at once. */
1787 pfnmerge = iov_pfn | phy_pfn;
1788
1789 while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
1790 pages >>= VTD_STRIDE_SHIFT;
1791 if (!pages)
1792 break;
1793 pfnmerge >>= VTD_STRIDE_SHIFT;
1794 level++;
1795 support--;
1796 }
1797 return level;
1798 }
1799
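/*
 * Worked example (capabilities assumed): if the IOMMU reports 2MiB and
 * 1GiB superpage support (domain->iommu_superpage == 2), mapping 0x200
 * contiguous pages at iov_pfn 0x40000 / phy_pfn 0x80000 has both PFNs
 * 512-aligned, so the first loop iteration promotes level to 2 (one 2MiB
 * superpage); the next iteration breaks out because 'pages' shrinks to
 * zero, so 1GiB is never attempted for this mapping.
 */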
1800 static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1801 struct scatterlist *sg, unsigned long phys_pfn,
1802 unsigned long nr_pages, int prot)
1803 {
1804 struct dma_pte *first_pte = NULL, *pte = NULL;
1805 phys_addr_t uninitialized_var(pteval);
1806 int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
1807 unsigned long sg_res;
1808 unsigned int largepage_lvl = 0;
1809 unsigned long lvl_pages = 0;
1810
1811 BUG_ON(addr_width < BITS_PER_LONG && (iov_pfn + nr_pages - 1) >> addr_width);
1812
1813 if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
1814 return -EINVAL;
1815
1816 prot &= DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP;
1817
1818 if (sg)
1819 sg_res = 0;
1820 else {
1821 sg_res = nr_pages + 1;
1822 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | prot;
1823 }
1824
1825 while (nr_pages > 0) {
1826 uint64_t tmp;
1827
1828 if (!sg_res) {
1829 sg_res = aligned_nrpages(sg->offset, sg->length);
1830 sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + sg->offset;
1831 sg->dma_length = sg->length;
1832 pteval = page_to_phys(sg_page(sg)) | prot;
1833 phys_pfn = pteval >> VTD_PAGE_SHIFT;
1834 }
1835
1836 if (!pte) {
1837 largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res);
1838
1839 first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl);
1840 if (!pte)
1841 return -ENOMEM;
1842 /* It is a large page */
1843 if (largepage_lvl > 1) {
1844 pteval |= DMA_PTE_LARGE_PAGE;
1845 /* Ensure that old small page tables are removed to make room
1846 for superpage, if they exist. */
1847 dma_pte_clear_range(domain, iov_pfn,
1848 iov_pfn + lvl_to_nr_pages(largepage_lvl) - 1);
1849 dma_pte_free_pagetable(domain, iov_pfn,
1850 iov_pfn + lvl_to_nr_pages(largepage_lvl) - 1);
1851 } else {
1852 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
1853 }
1854
1855 }
1856 /* We don't need lock here, nobody else
1857 * touches the iova range
1858 */
1859 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
1860 if (tmp) {
1861 static int dumps = 5;
1862 printk(KERN_CRIT "ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
1863 iov_pfn, tmp, (unsigned long long)pteval);
1864 if (dumps) {
1865 dumps--;
1866 debug_dma_dump_mappings(NULL);
1867 }
1868 WARN_ON(1);
1869 }
1870
1871 lvl_pages = lvl_to_nr_pages(largepage_lvl);
1872
1873 BUG_ON(nr_pages < lvl_pages);
1874 BUG_ON(sg_res < lvl_pages);
1875
1876 nr_pages -= lvl_pages;
1877 iov_pfn += lvl_pages;
1878 phys_pfn += lvl_pages;
1879 pteval += lvl_pages * VTD_PAGE_SIZE;
1880 sg_res -= lvl_pages;
1881
1882 /* If the next PTE would be the first in a new page, then we
1883 need to flush the cache on the entries we've just written.
1884 And then we'll need to recalculate 'pte', so clear it and
1885 let it get set again in the if (!pte) block above.
1886
1887 If we're done (!nr_pages) we need to flush the cache too.
1888
1889 Also if we've been setting superpages, we may need to
1890 recalculate 'pte' and switch back to smaller pages for the
1891 end of the mapping, if the trailing size is not enough to
1892 use another superpage (i.e. sg_res < lvl_pages). */
1893 pte++;
1894 if (!nr_pages || first_pte_in_page(pte) ||
1895 (largepage_lvl > 1 && sg_res < lvl_pages)) {
1896 domain_flush_cache(domain, first_pte,
1897 (void *)pte - (void *)first_pte);
1898 pte = NULL;
1899 }
1900
1901 if (!sg_res && nr_pages)
1902 sg = sg_next(sg);
1903 }
1904 return 0;
1905 }
1906
1907 static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1908 struct scatterlist *sg, unsigned long nr_pages,
1909 int prot)
1910 {
1911 return __domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
1912 }
1913
1914 static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1915 unsigned long phys_pfn, unsigned long nr_pages,
1916 int prot)
1917 {
1918 return __domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
1919 }
1920
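/*
 * Usage sketch (illustrative, mirroring the identity-map path later in
 * this file): a physical range can be mapped 1:1 with
 *
 *	domain_pfn_mapping(domain, iov_pfn, iov_pfn, nr_pages,
 *			   DMA_PTE_READ | DMA_PTE_WRITE);
 *
 * while the DMA API path hands a scatterlist to domain_sg_mapping() and
 * lets __domain_mapping() pick a superpage level per contiguous chunk.
 */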
1921 static void iommu_detach_dev(struct intel_iommu *iommu, u8 bus, u8 devfn)
1922 {
1923 if (!iommu)
1924 return;
1925
1926 clear_context_table(iommu, bus, devfn);
1927 iommu->flush.flush_context(iommu, 0, 0, 0,
1928 DMA_CCMD_GLOBAL_INVL);
1929 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
1930 }
1931
1932 static inline void unlink_domain_info(struct device_domain_info *info)
1933 {
1934 assert_spin_locked(&device_domain_lock);
1935 list_del(&info->link);
1936 list_del(&info->global);
1937 if (info->dev)
1938 info->dev->dev.archdata.iommu = NULL;
1939 }
1940
1941 static void domain_remove_dev_info(struct dmar_domain *domain)
1942 {
1943 struct device_domain_info *info;
1944 unsigned long flags, flags2;
1945 struct intel_iommu *iommu;
1946
1947 spin_lock_irqsave(&device_domain_lock, flags);
1948 while (!list_empty(&domain->devices)) {
1949 info = list_entry(domain->devices.next,
1950 struct device_domain_info, link);
1951 unlink_domain_info(info);
1952 spin_unlock_irqrestore(&device_domain_lock, flags);
1953
1954 iommu_disable_dev_iotlb(info);
1955 iommu = device_to_iommu(info->segment, info->bus, info->devfn);
1956 iommu_detach_dev(iommu, info->bus, info->devfn);
1957
1958 if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE) {
1959 iommu_detach_dependent_devices(iommu, info->dev);
1960 /* clear this iommu in iommu_bmp, update iommu count
1961 * and capabilities
1962 */
1963 spin_lock_irqsave(&domain->iommu_lock, flags2);
1964 if (test_and_clear_bit(iommu->seq_id,
1965 domain->iommu_bmp)) {
1966 domain->iommu_count--;
1967 domain_update_iommu_cap(domain);
1968 }
1969 spin_unlock_irqrestore(&domain->iommu_lock, flags2);
1970 }
1971
1972 free_devinfo_mem(info);
1973 spin_lock_irqsave(&device_domain_lock, flags);
1974 }
1975 spin_unlock_irqrestore(&device_domain_lock, flags);
1976 }
1977
1978 /*
1979 * find_domain
1980 * Note: the per-device info is stored in struct pci_dev->dev.archdata.iommu
1981 */
1982 static struct dmar_domain *
1983 find_domain(struct pci_dev *pdev)
1984 {
1985 struct device_domain_info *info;
1986
1987 /* No lock here, assumes no domain exit in normal case */
1988 info = pdev->dev.archdata.iommu;
1989 if (info)
1990 return info->domain;
1991 return NULL;
1992 }
1993
1994 static inline struct dmar_domain *
1995 dmar_search_domain_by_dev_info(int segment, int bus, int devfn)
1996 {
1997 struct device_domain_info *info;
1998
1999 list_for_each_entry(info, &device_domain_list, global)
2000 if (info->segment == segment && info->bus == bus &&
2001 info->devfn == devfn)
2002 return info->domain;
2003
2004 return NULL;
2005 }
2006
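/*
 * Record that (segment, bus, devfn) is handled by *domp.  If the device
 * was already registered (e.g. by a racing thread), the freshly
 * allocated info is dropped; and if the existing entry uses a different
 * domain, the caller's domain is freed and *domp is updated to point at
 * the one already in use.
 */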
2007 static int dmar_insert_dev_info(int segment, int bus, int devfn,
2008 struct pci_dev *dev, struct dmar_domain **domp)
2009 {
2010 struct dmar_domain *found, *domain = *domp;
2011 struct device_domain_info *info;
2012 unsigned long flags;
2013
2014 info = alloc_devinfo_mem();
2015 if (!info)
2016 return -ENOMEM;
2017
2018 info->segment = segment;
2019 info->bus = bus;
2020 info->devfn = devfn;
2021 info->dev = dev;
2022 info->domain = domain;
2023 if (!dev)
2024 domain->flags |= DOMAIN_FLAG_P2P_MULTIPLE_DEVICES;
2025
2026 spin_lock_irqsave(&device_domain_lock, flags);
2027 if (dev)
2028 found = find_domain(dev);
2029 else
2030 found = dmar_search_domain_by_dev_info(segment, bus, devfn);
2031 if (found) {
2032 spin_unlock_irqrestore(&device_domain_lock, flags);
2033 free_devinfo_mem(info);
2034 if (found != domain) {
2035 domain_exit(domain);
2036 *domp = found;
2037 }
2038 } else {
2039 list_add(&info->link, &domain->devices);
2040 list_add(&info->global, &device_domain_list);
2041 if (dev)
2042 dev->dev.archdata.iommu = info;
2043 spin_unlock_irqrestore(&device_domain_lock, flags);
2044 }
2045
2046 return 0;
2047 }
2048
2049 /* Return an initialized domain for the device, allocating one if necessary */
2050 static struct dmar_domain *get_domain_for_dev(struct pci_dev *pdev, int gaw)
2051 {
2052 struct dmar_domain *domain, *free = NULL;
2053 struct intel_iommu *iommu;
2054 struct dmar_drhd_unit *drhd;
2055 struct pci_dev *dev_tmp;
2056 unsigned long flags;
2057 int bus = 0, devfn = 0;
2058 int segment;
2059
2060 domain = find_domain(pdev);
2061 if (domain)
2062 return domain;
2063
2064 segment = pci_domain_nr(pdev->bus);
2065
2066 dev_tmp = pci_find_upstream_pcie_bridge(pdev);
2067 if (dev_tmp) {
2068 if (pci_is_pcie(dev_tmp)) {
2069 bus = dev_tmp->subordinate->number;
2070 devfn = 0;
2071 } else {
2072 bus = dev_tmp->bus->number;
2073 devfn = dev_tmp->devfn;
2074 }
2075 spin_lock_irqsave(&device_domain_lock, flags);
2076 domain = dmar_search_domain_by_dev_info(segment, bus, devfn);
2077 spin_unlock_irqrestore(&device_domain_lock, flags);
2078 /* the pcie-pci bridge already has a domain; use it */
2079 if (domain)
2080 goto found_domain;
2081 }
2082
2083 drhd = dmar_find_matched_drhd_unit(pdev);
2084 if (!drhd) {
2085 printk(KERN_ERR "IOMMU: can't find DMAR for device %s\n",
2086 pci_name(pdev));
2087 return NULL;
2088 }
2089 iommu = drhd->iommu;
2090
2091 /* Allocate and initialize a new domain for the device */
2092 domain = alloc_domain(false);
2093 if (!domain)
2094 goto error;
2095 if (iommu_attach_domain(domain, iommu)) {
2096 free_domain_mem(domain);
2097 goto error;
2098 }
2099 free = domain;
2100 if (domain_init(domain, gaw))
2101 goto error;
2102
2103 /* register pcie-to-pci device */
2104 if (dev_tmp) {
2105 if (dmar_insert_dev_info(segment, bus, devfn, NULL, &domain))
2106 goto error;
2107 else
2108 free = NULL;
2109 }
2110
2111 found_domain:
2112 if (dmar_insert_dev_info(segment, pdev->bus->number, pdev->devfn,
2113 pdev, &domain) == 0)
2114 return domain;
2115 error:
2116 if (free)
2117 domain_exit(free);
2118 /* recheck it here, maybe others set it */
2119 return find_domain(pdev);
2120 }
2121
2122 static int iommu_identity_mapping;
2123 #define IDENTMAP_ALL 1
2124 #define IDENTMAP_GFX 2
2125 #define IDENTMAP_AZALIA 4
2126
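/*
 * Reserve the IOVA range [start, end] in @domain and install a 1:1
 * read/write mapping for it, clearing any PTEs left over from an
 * overlapping physical range first.  iommu_prepare_isa() below, for
 * example, ends up here to identity-map the first 16MiB for the
 * LPC/floppy workaround.
 */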
2127 static int iommu_domain_identity_map(struct dmar_domain *domain,
2128 unsigned long long start,
2129 unsigned long long end)
2130 {
2131 unsigned long first_vpfn = start >> VTD_PAGE_SHIFT;
2132 unsigned long last_vpfn = end >> VTD_PAGE_SHIFT;
2133
2134 if (!reserve_iova(&domain->iovad, dma_to_mm_pfn(first_vpfn),
2135 dma_to_mm_pfn(last_vpfn))) {
2136 printk(KERN_ERR "IOMMU: reserve iova failed\n");
2137 return -ENOMEM;
2138 }
2139
2140 pr_debug("Mapping reserved region %llx-%llx for domain %d\n",
2141 start, end, domain->id);
2142 /*
2143 * The RMRR range might overlap with the physical memory range,
2144 * so clear it first
2145 */
2146 dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2147
2148 return domain_pfn_mapping(domain, first_vpfn, first_vpfn,
2149 last_vpfn - first_vpfn + 1,
2150 DMA_PTE_READ|DMA_PTE_WRITE);
2151 }
2152
2153 static int iommu_prepare_identity_map(struct pci_dev *pdev,
2154 unsigned long long start,
2155 unsigned long long end)
2156 {
2157 struct dmar_domain *domain;
2158 int ret;
2159
2160 domain = get_domain_for_dev(pdev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
2161 if (!domain)
2162 return -ENOMEM;
2163
2164 /* For _hardware_ passthrough, don't bother. But for software
2165 passthrough, we do it anyway -- it may indicate a memory
2166 range which is reserved in E820 and so didn't get set
2167 up to start with in si_domain */
2168 if (domain == si_domain && hw_pass_through) {
2169 printk("Ignoring identity map for HW passthrough device %s [0x%Lx - 0x%Lx]\n",
2170 pci_name(pdev), start, end);
2171 return 0;
2172 }
2173
2174 printk(KERN_INFO
2175 "IOMMU: Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
2176 pci_name(pdev), start, end);
2177
2178 if (end < start) {
2179 WARN(1, "Your BIOS is broken; RMRR ends before it starts!\n"
2180 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2181 dmi_get_system_info(DMI_BIOS_VENDOR),
2182 dmi_get_system_info(DMI_BIOS_VERSION),
2183 dmi_get_system_info(DMI_PRODUCT_VERSION));
2184 ret = -EIO;
2185 goto error;
2186 }
2187
2188 if (end >> agaw_to_width(domain->agaw)) {
2189 WARN(1, "Your BIOS is broken; RMRR exceeds permitted address width (%d bits)\n"
2190 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2191 agaw_to_width(domain->agaw),
2192 dmi_get_system_info(DMI_BIOS_VENDOR),
2193 dmi_get_system_info(DMI_BIOS_VERSION),
2194 dmi_get_system_info(DMI_PRODUCT_VERSION));
2195 ret = -EIO;
2196 goto error;
2197 }
2198
2199 ret = iommu_domain_identity_map(domain, start, end);
2200 if (ret)
2201 goto error;
2202
2203 /* context entry init */
2204 ret = domain_context_mapping(domain, pdev, CONTEXT_TT_MULTI_LEVEL);
2205 if (ret)
2206 goto error;
2207
2208 return 0;
2209
2210 error:
2211 domain_exit(domain);
2212 return ret;
2213 }
2214
2215 static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
2216 struct pci_dev *pdev)
2217 {
2218 if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2219 return 0;
2220 return iommu_prepare_identity_map(pdev, rmrr->base_address,
2221 rmrr->end_address);
2222 }
2223
2224 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
2225 static inline void iommu_prepare_isa(void)
2226 {
2227 struct pci_dev *pdev;
2228 int ret;
2229
2230 pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
2231 if (!pdev)
2232 return;
2233
2234 printk(KERN_INFO "IOMMU: Prepare 0-16MiB unity mapping for LPC\n");
2235 ret = iommu_prepare_identity_map(pdev, 0, 16*1024*1024 - 1);
2236
2237 if (ret)
2238 printk(KERN_ERR "IOMMU: Failed to create 0-16MiB identity map; "
2239 "floppy might not work\n");
2240
2241 }
2242 #else
2243 static inline void iommu_prepare_isa(void)
2244 {
2245 return;
2246 }
2247 #endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */
2248
2249 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2250
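/*
 * Build the static identity (si) domain: attach it to every active
 * IOMMU and, unless hardware pass-through is in use, identity-map every
 * online memory range into it.
 */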
2251 static int __init si_domain_init(int hw)
2252 {
2253 struct dmar_drhd_unit *drhd;
2254 struct intel_iommu *iommu;
2255 int nid, ret = 0;
2256
2257 si_domain = alloc_domain(false);
2258 if (!si_domain)
2259 return -EFAULT;
2260
2261 si_domain->flags = DOMAIN_FLAG_STATIC_IDENTITY;
2262
2263 for_each_active_iommu(iommu, drhd) {
2264 ret = iommu_attach_domain(si_domain, iommu);
2265 if (ret) {
2266 domain_exit(si_domain);
2267 return -EFAULT;
2268 }
2269 }
2270
2271 if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2272 domain_exit(si_domain);
2273 return -EFAULT;
2274 }
2275
2276 pr_debug("IOMMU: identity mapping domain is domain %d\n",
2277 si_domain->id);
2278
2279 if (hw)
2280 return 0;
2281
2282 for_each_online_node(nid) {
2283 unsigned long start_pfn, end_pfn;
2284 int i;
2285
2286 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2287 ret = iommu_domain_identity_map(si_domain,
2288 PFN_PHYS(start_pfn), PFN_PHYS(end_pfn));
2289 if (ret)
2290 return ret;
2291 }
2292 }
2293
2294 return 0;
2295 }
2296
2297 static int identity_mapping(struct pci_dev *pdev)
2298 {
2299 struct device_domain_info *info;
2300
2301 if (likely(!iommu_identity_mapping))
2302 return 0;
2303
2304 info = pdev->dev.archdata.iommu;
2305 if (info && info != DUMMY_DEVICE_DOMAIN_INFO)
2306 return (info->domain == si_domain);
2307
2308 return 0;
2309 }
2310
2311 static int domain_add_dev_info(struct dmar_domain *domain,
2312 struct pci_dev *pdev,
2313 int translation)
2314 {
2315 struct device_domain_info *info;
2316 unsigned long flags;
2317 int ret;
2318
2319 info = alloc_devinfo_mem();
2320 if (!info)
2321 return -ENOMEM;
2322
2323 info->segment = pci_domain_nr(pdev->bus);
2324 info->bus = pdev->bus->number;
2325 info->devfn = pdev->devfn;
2326 info->dev = pdev;
2327 info->domain = domain;
2328
2329 spin_lock_irqsave(&device_domain_lock, flags);
2330 list_add(&info->link, &domain->devices);
2331 list_add(&info->global, &device_domain_list);
2332 pdev->dev.archdata.iommu = info;
2333 spin_unlock_irqrestore(&device_domain_lock, flags);
2334
2335 ret = domain_context_mapping(domain, pdev, translation);
2336 if (ret) {
2337 spin_lock_irqsave(&device_domain_lock, flags);
2338 unlink_domain_info(info);
2339 spin_unlock_irqrestore(&device_domain_lock, flags);
2340 free_devinfo_mem(info);
2341 return ret;
2342 }
2343
2344 return 0;
2345 }
2346
2347 static bool device_has_rmrr(struct pci_dev *dev)
2348 {
2349 struct dmar_rmrr_unit *rmrr;
2350 struct pci_dev *tmp;
2351 int i;
2352
2353 rcu_read_lock();
2354 for_each_rmrr_units(rmrr) {
2355 /*
2356 * Return TRUE if this RMRR contains the device that
2357 * is passed in.
2358 */
2359 for_each_active_dev_scope(rmrr->devices,
2360 rmrr->devices_cnt, i, tmp)
2361 if (tmp == dev) {
2362 rcu_read_unlock();
2363 return true;
2364 }
2365 }
2366 rcu_read_unlock();
2367 return false;
2368 }
2369
2370 static int iommu_should_identity_map(struct pci_dev *pdev, int startup)
2371 {
2372
2373 /*
2374 * We want to prevent any device associated with an RMRR from
2375 * getting placed into the SI Domain. This is done because
2376 * problems exist when devices are moved in and out of domains
2377 * and their respective RMRR info is lost. We exempt USB devices
2378 * from this process due to their usage of RMRRs that are known
2379 * to not be needed after BIOS hand-off to OS.
2380 */
2381 if (device_has_rmrr(pdev) &&
2382 (pdev->class >> 8) != PCI_CLASS_SERIAL_USB)
2383 return 0;
2384
2385 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2386 return 1;
2387
2388 if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2389 return 1;
2390
2391 if (!(iommu_identity_mapping & IDENTMAP_ALL))
2392 return 0;
2393
2394 /*
2395 * We want to start off with all devices in the 1:1 domain, and
2396 * take them out later if we find they can't access all of memory.
2397 *
2398 * However, we can't do this for PCI devices behind bridges,
2399 * because all PCI devices behind the same bridge will end up
2400 * with the same source-id on their transactions.
2401 *
2402 * Practically speaking, we can't change things around for these
2403 * devices at run-time, because we can't be sure there'll be no
2404 * DMA transactions in flight for any of their siblings.
2405 *
2406 * So PCI devices (unless they're on the root bus) as well as
2407 * their parent PCI-PCI or PCIe-PCI bridges must be left _out_ of
2408 * the 1:1 domain, just in _case_ one of their siblings turns out
2409 * not to be able to map all of memory.
2410 */
2411 if (!pci_is_pcie(pdev)) {
2412 if (!pci_is_root_bus(pdev->bus))
2413 return 0;
2414 if (pdev->class >> 8 == PCI_CLASS_BRIDGE_PCI)
2415 return 0;
2416 } else if (pci_pcie_type(pdev) == PCI_EXP_TYPE_PCI_BRIDGE)
2417 return 0;
2418
2419 /*
2420 * At boot time, we don't yet know if devices will be 64-bit capable.
2421 * Assume that they will -- if they turn out not to be, then we can
2422 * take them out of the 1:1 domain later.
2423 */
2424 if (!startup) {
2425 /*
2426 * If the device's dma_mask is less than the system's memory
2427 * size then this is not a candidate for identity mapping.
2428 */
2429 u64 dma_mask = pdev->dma_mask;
2430
2431 if (pdev->dev.coherent_dma_mask &&
2432 pdev->dev.coherent_dma_mask < dma_mask)
2433 dma_mask = pdev->dev.coherent_dma_mask;
2434
2435 return dma_mask >= dma_get_required_mask(&pdev->dev);
2436 }
2437
2438 return 1;
2439 }
2440
2441 static int __init iommu_prepare_static_identity_mapping(int hw)
2442 {
2443 struct pci_dev *pdev = NULL;
2444 int ret;
2445
2446 ret = si_domain_init(hw);
2447 if (ret)
2448 return -EFAULT;
2449
2450 for_each_pci_dev(pdev) {
2451 if (iommu_should_identity_map(pdev, 1)) {
2452 ret = domain_add_dev_info(si_domain, pdev,
2453 hw ? CONTEXT_TT_PASS_THROUGH :
2454 CONTEXT_TT_MULTI_LEVEL);
2455 if (ret) {
2456 /* device not associated with an iommu */
2457 if (ret == -ENODEV)
2458 continue;
2459 return ret;
2460 }
2461 pr_info("IOMMU: %s identity mapping for device %s\n",
2462 hw ? "hardware" : "software", pci_name(pdev));
2463 }
2464 }
2465
2466 return 0;
2467 }
2468
2469 static int __init init_dmars(void)
2470 {
2471 struct dmar_drhd_unit *drhd;
2472 struct dmar_rmrr_unit *rmrr;
2473 struct pci_dev *pdev;
2474 struct intel_iommu *iommu;
2475 int i, ret;
2476
2477 /*
2478 * for each drhd
2479 * allocate root
2480 * initialize and program root entry to not present
2481 * endfor
2482 */
2483 for_each_drhd_unit(drhd) {
2484 /*
2485 * No lock needed: this is only incremented in the single-
2486 * threaded kernel __init code path; all other accesses are
2487 * read-only
2488 */
2489 if (g_num_of_iommus < IOMMU_UNITS_SUPPORTED) {
2490 g_num_of_iommus++;
2491 continue;
2492 }
2493 printk_once(KERN_ERR "intel-iommu: exceeded %d IOMMUs\n",
2494 IOMMU_UNITS_SUPPORTED);
2495 }
2496
2497 g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
2498 GFP_KERNEL);
2499 if (!g_iommus) {
2500 printk(KERN_ERR "Allocating global iommu array failed\n");
2501 ret = -ENOMEM;
2502 goto error;
2503 }
2504
2505 deferred_flush = kzalloc(g_num_of_iommus *
2506 sizeof(struct deferred_flush_tables), GFP_KERNEL);
2507 if (!deferred_flush) {
2508 ret = -ENOMEM;
2509 goto free_g_iommus;
2510 }
2511
2512 for_each_active_iommu(iommu, drhd) {
2513 g_iommus[iommu->seq_id] = iommu;
2514
2515 ret = iommu_init_domains(iommu);
2516 if (ret)
2517 goto free_iommu;
2518
2519 /*
2520 * TBD:
2521 * we could share the same root & context tables
2522 * among all IOMMUs. Needs to be split out later.
2523 */
2524 ret = iommu_alloc_root_entry(iommu);
2525 if (ret) {
2526 printk(KERN_ERR "IOMMU: allocate root entry failed\n");
2527 goto free_iommu;
2528 }
2529 if (!ecap_pass_through(iommu->ecap))
2530 hw_pass_through = 0;
2531 }
2532
2533 /*
2534 * Start from a sane iommu hardware state.
2535 */
2536 for_each_active_iommu(iommu, drhd) {
2537 /*
2538 * If the queued invalidation is already initialized by us
2539 * (for example, while enabling interrupt-remapping) then
2540 * things are already rolling from a sane state.
2541 */
2542 if (iommu->qi)
2543 continue;
2544
2545 /*
2546 * Clear any previous faults.
2547 */
2548 dmar_fault(-1, iommu);
2549 /*
2550 * Disable queued invalidation if supported and already enabled
2551 * before OS handover.
2552 */
2553 dmar_disable_qi(iommu);
2554 }
2555
2556 for_each_active_iommu(iommu, drhd) {
2557 if (dmar_enable_qi(iommu)) {
2558 /*
2559 * Queued Invalidate not enabled, use Register Based
2560 * Invalidate
2561 */
2562 iommu->flush.flush_context = __iommu_flush_context;
2563 iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2564 printk(KERN_INFO "IOMMU %d 0x%Lx: using Register based "
2565 "invalidation\n",
2566 iommu->seq_id,
2567 (unsigned long long)drhd->reg_base_addr);
2568 } else {
2569 iommu->flush.flush_context = qi_flush_context;
2570 iommu->flush.flush_iotlb = qi_flush_iotlb;
2571 printk(KERN_INFO "IOMMU %d 0x%Lx: using Queued "
2572 "invalidation\n",
2573 iommu->seq_id,
2574 (unsigned long long)drhd->reg_base_addr);
2575 }
2576 }
2577
2578 if (iommu_pass_through)
2579 iommu_identity_mapping |= IDENTMAP_ALL;
2580
2581 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
2582 iommu_identity_mapping |= IDENTMAP_GFX;
2583 #endif
2584
2585 check_tylersburg_isoch();
2586
2587 /*
2588 * If pass-through is not set or not enabled, set up context entries for
2589 * identity mappings for rmrr, gfx, and isa, and possibly fall back to static
2590 * identity mapping if iommu_identity_mapping is set.
2591 */
2592 if (iommu_identity_mapping) {
2593 ret = iommu_prepare_static_identity_mapping(hw_pass_through);
2594 if (ret) {
2595 printk(KERN_CRIT "Failed to setup IOMMU pass-through\n");
2596 goto free_iommu;
2597 }
2598 }
2599 /*
2600 * For each rmrr
2601 * for each dev attached to rmrr
2602 * do
2603 * locate drhd for dev, alloc domain for dev
2604 * allocate free domain
2605 * allocate page table entries for rmrr
2606 * if context not allocated for bus
2607 * allocate and init context
2608 * set present in root table for this bus
2609 * init context with domain, translation etc
2610 * endfor
2611 * endfor
2612 */
2613 printk(KERN_INFO "IOMMU: Setting RMRR:\n");
2614 for_each_rmrr_units(rmrr) {
2615 /* Some BIOSes list nonexistent devices in the DMAR table. */
2616 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
2617 i, pdev) {
2618 ret = iommu_prepare_rmrr_dev(rmrr, pdev);
2619 if (ret)
2620 printk(KERN_ERR
2621 "IOMMU: mapping reserved region failed\n");
2622 }
2623 }
2624
2625 iommu_prepare_isa();
2626
2627 /*
2628 * for each drhd
2629 * enable fault log
2630 * global invalidate context cache
2631 * global invalidate iotlb
2632 * enable translation
2633 */
2634 for_each_iommu(iommu, drhd) {
2635 if (drhd->ignored) {
2636 /*
2637 * we always have to disable PMRs or DMA may fail on
2638 * this device
2639 */
2640 if (force_on)
2641 iommu_disable_protect_mem_regions(iommu);
2642 continue;
2643 }
2644
2645 iommu_flush_write_buffer(iommu);
2646
2647 ret = dmar_set_interrupt(iommu);
2648 if (ret)
2649 goto free_iommu;
2650
2651 iommu_set_root_entry(iommu);
2652
2653 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
2654 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
2655
2656 ret = iommu_enable_translation(iommu);
2657 if (ret)
2658 goto free_iommu;
2659
2660 iommu_disable_protect_mem_regions(iommu);
2661 }
2662
2663 return 0;
2664
2665 free_iommu:
2666 for_each_active_iommu(iommu, drhd)
2667 free_dmar_iommu(iommu);
2668 kfree(deferred_flush);
2669 free_g_iommus:
2670 kfree(g_iommus);
2671 error:
2672 return ret;
2673 }
2674
2675 /* This takes a number of _MM_ pages, not VTD pages */
2676 static struct iova *intel_alloc_iova(struct device *dev,
2677 struct dmar_domain *domain,
2678 unsigned long nrpages, uint64_t dma_mask)
2679 {
2680 struct pci_dev *pdev = to_pci_dev(dev);
2681 struct iova *iova = NULL;
2682
2683 /* Restrict dma_mask to the width that the iommu can handle */
2684 dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw), dma_mask);
2685
2686 if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
2687 /*
2688 * First try to allocate an io virtual address in
2689 * DMA_BIT_MASK(32) and if that fails then try allocating
2690 * from higher range
2691 */
2692 iova = alloc_iova(&domain->iovad, nrpages,
2693 IOVA_PFN(DMA_BIT_MASK(32)), 1);
2694 if (iova)
2695 return iova;
2696 }
2697 iova = alloc_iova(&domain->iovad, nrpages, IOVA_PFN(dma_mask), 1);
2698 if (unlikely(!iova)) {
2699 printk(KERN_ERR "Allocating %ld-page iova for %s failed",
2700 nrpages, pci_name(pdev));
2701 return NULL;
2702 }
2703
2704 return iova;
2705 }
2706
2707 static struct dmar_domain *__get_valid_domain_for_dev(struct pci_dev *pdev)
2708 {
2709 struct dmar_domain *domain;
2710 int ret;
2711
2712 domain = get_domain_for_dev(pdev,
2713 DEFAULT_DOMAIN_ADDRESS_WIDTH);
2714 if (!domain) {
2715 printk(KERN_ERR
2716 "Allocating domain for %s failed", pci_name(pdev));
2717 return NULL;
2718 }
2719
2720 /* make sure context mapping is ok */
2721 if (unlikely(!domain_context_mapped(pdev))) {
2722 ret = domain_context_mapping(domain, pdev,
2723 CONTEXT_TT_MULTI_LEVEL);
2724 if (ret) {
2725 printk(KERN_ERR
2726 "Domain context map for %s failed",
2727 pci_name(pdev));
2728 return NULL;
2729 }
2730 }
2731
2732 return domain;
2733 }
2734
2735 static inline struct dmar_domain *get_valid_domain_for_dev(struct pci_dev *dev)
2736 {
2737 struct device_domain_info *info;
2738
2739 /* No lock here, assumes no domain exit in normal case */
2740 info = dev->dev.archdata.iommu;
2741 if (likely(info))
2742 return info->domain;
2743
2744 return __get_valid_domain_for_dev(dev);
2745 }
2746
2747 static int iommu_dummy(struct pci_dev *pdev)
2748 {
2749 return pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO;
2750 }
2751
2752 /* Check if the pdev needs to go through the non-identity map/unmap process. */
2753 static int iommu_no_mapping(struct device *dev)
2754 {
2755 struct pci_dev *pdev;
2756 int found;
2757
2758 if (unlikely(!dev_is_pci(dev)))
2759 return 1;
2760
2761 pdev = to_pci_dev(dev);
2762 if (iommu_dummy(pdev))
2763 return 1;
2764
2765 if (!iommu_identity_mapping)
2766 return 0;
2767
2768 found = identity_mapping(pdev);
2769 if (found) {
2770 if (iommu_should_identity_map(pdev, 0))
2771 return 1;
2772 else {
2773 /*
2774 * A 32-bit DMA device is removed from si_domain and falls back
2775 * to non-identity mapping.
2776 */
2777 domain_remove_one_dev_info(si_domain, pdev);
2778 printk(KERN_INFO "32bit %s uses non-identity mapping\n",
2779 pci_name(pdev));
2780 return 0;
2781 }
2782 } else {
2783 /*
2784 * If a 64-bit DMA device has been detached from a VM, the device
2785 * is put into si_domain for identity mapping.
2786 */
2787 if (iommu_should_identity_map(pdev, 0)) {
2788 int ret;
2789 ret = domain_add_dev_info(si_domain, pdev,
2790 hw_pass_through ?
2791 CONTEXT_TT_PASS_THROUGH :
2792 CONTEXT_TT_MULTI_LEVEL);
2793 if (!ret) {
2794 printk(KERN_INFO "64bit %s uses identity mapping\n",
2795 pci_name(pdev));
2796 return 1;
2797 }
2798 }
2799 }
2800
2801 return 0;
2802 }
2803
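/*
 * Core of the DMA-API map path: find (or create) the device's domain,
 * allocate an IOVA range below dma_mask, build PTEs with permissions
 * derived from the DMA direction, and flush the IOTLB (caching mode) or
 * the write buffer as needed.  Devices that need no translation simply
 * get paddr back unchanged.
 */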
2804 static dma_addr_t __intel_map_single(struct device *hwdev, phys_addr_t paddr,
2805 size_t size, int dir, u64 dma_mask)
2806 {
2807 struct pci_dev *pdev = to_pci_dev(hwdev);
2808 struct dmar_domain *domain;
2809 phys_addr_t start_paddr;
2810 struct iova *iova;
2811 int prot = 0;
2812 int ret;
2813 struct intel_iommu *iommu;
2814 unsigned long paddr_pfn = paddr >> PAGE_SHIFT;
2815
2816 BUG_ON(dir == DMA_NONE);
2817
2818 if (iommu_no_mapping(hwdev))
2819 return paddr;
2820
2821 domain = get_valid_domain_for_dev(pdev);
2822 if (!domain)
2823 return 0;
2824
2825 iommu = domain_get_iommu(domain);
2826 size = aligned_nrpages(paddr, size);
2827
2828 iova = intel_alloc_iova(hwdev, domain, dma_to_mm_pfn(size), dma_mask);
2829 if (!iova)
2830 goto error;
2831
2832 /*
2833 * Check if DMAR supports zero-length reads on write only
2834 * mappings.
2835 */
2836 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
2837 !cap_zlr(iommu->cap))
2838 prot |= DMA_PTE_READ;
2839 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
2840 prot |= DMA_PTE_WRITE;
2841 /*
2842 * paddr to (paddr + size) might span a partial page, so we map the whole
2843 * page. Note: if two parts of one page are mapped separately, we
2844 * might have two guest addresses mapping to the same host paddr, but this
2845 * is not a big problem
2846 */
2847 ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova->pfn_lo),
2848 mm_to_dma_pfn(paddr_pfn), size, prot);
2849 if (ret)
2850 goto error;
2851
2852 /* it's a non-present to present mapping. Only flush if caching mode */
2853 if (cap_caching_mode(iommu->cap))
2854 iommu_flush_iotlb_psi(iommu, domain->id, mm_to_dma_pfn(iova->pfn_lo), size, 1);
2855 else
2856 iommu_flush_write_buffer(iommu);
2857
2858 start_paddr = (phys_addr_t)iova->pfn_lo << PAGE_SHIFT;
2859 start_paddr += paddr & ~PAGE_MASK;
2860 return start_paddr;
2861
2862 error:
2863 if (iova)
2864 __free_iova(&domain->iovad, iova);
2865 printk(KERN_ERR"Device %s request: %zx@%llx dir %d --- failed\n",
2866 pci_name(pdev), size, (unsigned long long)paddr, dir);
2867 return 0;
2868 }
2869
2870 static dma_addr_t intel_map_page(struct device *dev, struct page *page,
2871 unsigned long offset, size_t size,
2872 enum dma_data_direction dir,
2873 struct dma_attrs *attrs)
2874 {
2875 return __intel_map_single(dev, page_to_phys(page) + offset, size,
2876 dir, to_pci_dev(dev)->dma_mask);
2877 }
2878
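/*
 * Drain the per-IOMMU deferred-unmap queues filled by add_unmap() below:
 * invalidate the IOTLB (one global flush per IOMMU on real hardware,
 * page-selective flushes per entry under caching mode) and free the
 * queued IOVAs.
 */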
2879 static void flush_unmaps(void)
2880 {
2881 int i, j;
2882
2883 timer_on = 0;
2884
2885 /* just flush them all */
2886 for (i = 0; i < g_num_of_iommus; i++) {
2887 struct intel_iommu *iommu = g_iommus[i];
2888 if (!iommu)
2889 continue;
2890
2891 if (!deferred_flush[i].next)
2892 continue;
2893
2894 /* In caching mode, global flushes make emulation expensive */
2895 if (!cap_caching_mode(iommu->cap))
2896 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
2897 DMA_TLB_GLOBAL_FLUSH);
2898 for (j = 0; j < deferred_flush[i].next; j++) {
2899 unsigned long mask;
2900 struct iova *iova = deferred_flush[i].iova[j];
2901 struct dmar_domain *domain = deferred_flush[i].domain[j];
2902
2903 /* On real hardware multiple invalidations are expensive */
2904 if (cap_caching_mode(iommu->cap))
2905 iommu_flush_iotlb_psi(iommu, domain->id,
2906 iova->pfn_lo, iova->pfn_hi - iova->pfn_lo + 1, 0);
2907 else {
2908 mask = ilog2(mm_to_dma_pfn(iova->pfn_hi - iova->pfn_lo + 1));
2909 iommu_flush_dev_iotlb(deferred_flush[i].domain[j],
2910 (uint64_t)iova->pfn_lo << PAGE_SHIFT, mask);
2911 }
2912 __free_iova(&deferred_flush[i].domain[j]->iovad, iova);
2913 }
2914 deferred_flush[i].next = 0;
2915 }
2916
2917 list_size = 0;
2918 }
2919
2920 static void flush_unmaps_timeout(unsigned long data)
2921 {
2922 unsigned long flags;
2923
2924 spin_lock_irqsave(&async_umap_flush_lock, flags);
2925 flush_unmaps();
2926 spin_unlock_irqrestore(&async_umap_flush_lock, flags);
2927 }
2928
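/*
 * Queue an IOVA for deferred freeing on its domain's IOMMU.  The queue
 * is drained when it reaches HIGH_WATER_MARK or when the 10ms
 * unmap_timer fires.
 */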
2929 static void add_unmap(struct dmar_domain *dom, struct iova *iova)
2930 {
2931 unsigned long flags;
2932 int next, iommu_id;
2933 struct intel_iommu *iommu;
2934
2935 spin_lock_irqsave(&async_umap_flush_lock, flags);
2936 if (list_size == HIGH_WATER_MARK)
2937 flush_unmaps();
2938
2939 iommu = domain_get_iommu(dom);
2940 iommu_id = iommu->seq_id;
2941
2942 next = deferred_flush[iommu_id].next;
2943 deferred_flush[iommu_id].domain[next] = dom;
2944 deferred_flush[iommu_id].iova[next] = iova;
2945 deferred_flush[iommu_id].next++;
2946
2947 if (!timer_on) {
2948 mod_timer(&unmap_timer, jiffies + msecs_to_jiffies(10));
2949 timer_on = 1;
2950 }
2951 list_size++;
2952 spin_unlock_irqrestore(&async_umap_flush_lock, flags);
2953 }
2954
2955 static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
2956 size_t size, enum dma_data_direction dir,
2957 struct dma_attrs *attrs)
2958 {
2959 struct pci_dev *pdev = to_pci_dev(dev);
2960 struct dmar_domain *domain;
2961 unsigned long start_pfn, last_pfn;
2962 struct iova *iova;
2963 struct intel_iommu *iommu;
2964
2965 if (iommu_no_mapping(dev))
2966 return;
2967
2968 domain = find_domain(pdev);
2969 BUG_ON(!domain);
2970
2971 iommu = domain_get_iommu(domain);
2972
2973 iova = find_iova(&domain->iovad, IOVA_PFN(dev_addr));
2974 if (WARN_ONCE(!iova, "Driver unmaps unmatched page at PFN %llx\n",
2975 (unsigned long long)dev_addr))
2976 return;
2977
2978 start_pfn = mm_to_dma_pfn(iova->pfn_lo);
2979 last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1;
2980
2981 pr_debug("Device %s unmapping: pfn %lx-%lx\n",
2982 pci_name(pdev), start_pfn, last_pfn);
2983
2984 /* clear the whole page */
2985 dma_pte_clear_range(domain, start_pfn, last_pfn);
2986
2987 /* free page tables */
2988 dma_pte_free_pagetable(domain, start_pfn, last_pfn);
2989
2990 if (intel_iommu_strict) {
2991 iommu_flush_iotlb_psi(iommu, domain->id, start_pfn,
2992 last_pfn - start_pfn + 1, 0);
2993 /* free iova */
2994 __free_iova(&domain->iovad, iova);
2995 } else {
2996 add_unmap(domain, iova);
2997 /*
2998 * queue up the release of the unmap to save the roughly 1/6th of the
2999 * CPU time used up by the iotlb flush operation...
3000 */
3001 }
3002 }
3003
3004 static void *intel_alloc_coherent(struct device *hwdev, size_t size,
3005 dma_addr_t *dma_handle, gfp_t flags,
3006 struct dma_attrs *attrs)
3007 {
3008 void *vaddr;
3009 int order;
3010
3011 size = PAGE_ALIGN(size);
3012 order = get_order(size);
3013
3014 if (!iommu_no_mapping(hwdev))
3015 flags &= ~(GFP_DMA | GFP_DMA32);
3016 else if (hwdev->coherent_dma_mask < dma_get_required_mask(hwdev)) {
3017 if (hwdev->coherent_dma_mask < DMA_BIT_MASK(32))
3018 flags |= GFP_DMA;
3019 else
3020 flags |= GFP_DMA32;
3021 }
3022
3023 vaddr = (void *)__get_free_pages(flags, order);
3024 if (!vaddr)
3025 return NULL;
3026 memset(vaddr, 0, size);
3027
3028 *dma_handle = __intel_map_single(hwdev, virt_to_bus(vaddr), size,
3029 DMA_BIDIRECTIONAL,
3030 hwdev->coherent_dma_mask);
3031 if (*dma_handle)
3032 return vaddr;
3033 free_pages((unsigned long)vaddr, order);
3034 return NULL;
3035 }
3036
3037 static void intel_free_coherent(struct device *hwdev, size_t size, void *vaddr,
3038 dma_addr_t dma_handle, struct dma_attrs *attrs)
3039 {
3040 int order;
3041
3042 size = PAGE_ALIGN(size);
3043 order = get_order(size);
3044
3045 intel_unmap_page(hwdev, dma_handle, size, DMA_BIDIRECTIONAL, NULL);
3046 free_pages((unsigned long)vaddr, order);
3047 }
3048
3049 static void intel_unmap_sg(struct device *hwdev, struct scatterlist *sglist,
3050 int nelems, enum dma_data_direction dir,
3051 struct dma_attrs *attrs)
3052 {
3053 struct pci_dev *pdev = to_pci_dev(hwdev);
3054 struct dmar_domain *domain;
3055 unsigned long start_pfn, last_pfn;
3056 struct iova *iova;
3057 struct intel_iommu *iommu;
3058
3059 if (iommu_no_mapping(hwdev))
3060 return;
3061
3062 domain = find_domain(pdev);
3063 BUG_ON(!domain);
3064
3065 iommu = domain_get_iommu(domain);
3066
3067 iova = find_iova(&domain->iovad, IOVA_PFN(sglist[0].dma_address));
3068 if (WARN_ONCE(!iova, "Driver unmaps unmatched sglist at PFN %llx\n",
3069 (unsigned long long)sglist[0].dma_address))
3070 return;
3071
3072 start_pfn = mm_to_dma_pfn(iova->pfn_lo);
3073 last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1;
3074
3075 /* clear the whole page */
3076 dma_pte_clear_range(domain, start_pfn, last_pfn);
3077
3078 /* free page tables */
3079 dma_pte_free_pagetable(domain, start_pfn, last_pfn);
3080
3081 if (intel_iommu_strict) {
3082 iommu_flush_iotlb_psi(iommu, domain->id, start_pfn,
3083 last_pfn - start_pfn + 1, 0);
3084 /* free iova */
3085 __free_iova(&domain->iovad, iova);
3086 } else {
3087 add_unmap(domain, iova);
3088 /*
3089 * queue up the release of the unmap to save the roughly 1/6th of the
3090 * CPU time used up by the iotlb flush operation...
3091 */
3092 }
3093 }
3094
3095 static int intel_nontranslate_map_sg(struct device *hddev,
3096 struct scatterlist *sglist, int nelems, int dir)
3097 {
3098 int i;
3099 struct scatterlist *sg;
3100
3101 for_each_sg(sglist, sg, nelems, i) {
3102 BUG_ON(!sg_page(sg));
3103 sg->dma_address = page_to_phys(sg_page(sg)) + sg->offset;
3104 sg->dma_length = sg->length;
3105 }
3106 return nelems;
3107 }
3108
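/*
 * DMA-API scatterlist map: allocate one IOVA range large enough for the
 * whole list, map every segment into it via domain_sg_mapping(), and
 * undo the partial mapping (PTEs, page tables, IOVA) on failure.
 * Devices that need no translation go through intel_nontranslate_map_sg()
 * instead.
 */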
3109 static int intel_map_sg(struct device *hwdev, struct scatterlist *sglist, int nelems,
3110 enum dma_data_direction dir, struct dma_attrs *attrs)
3111 {
3112 int i;
3113 struct pci_dev *pdev = to_pci_dev(hwdev);
3114 struct dmar_domain *domain;
3115 size_t size = 0;
3116 int prot = 0;
3117 struct iova *iova = NULL;
3118 int ret;
3119 struct scatterlist *sg;
3120 unsigned long start_vpfn;
3121 struct intel_iommu *iommu;
3122
3123 BUG_ON(dir == DMA_NONE);
3124 if (iommu_no_mapping(hwdev))
3125 return intel_nontranslate_map_sg(hwdev, sglist, nelems, dir);
3126
3127 domain = get_valid_domain_for_dev(pdev);
3128 if (!domain)
3129 return 0;
3130
3131 iommu = domain_get_iommu(domain);
3132
3133 for_each_sg(sglist, sg, nelems, i)
3134 size += aligned_nrpages(sg->offset, sg->length);
3135
3136 iova = intel_alloc_iova(hwdev, domain, dma_to_mm_pfn(size),
3137 pdev->dma_mask);
3138 if (!iova) {
3139 sglist->dma_length = 0;
3140 return 0;
3141 }
3142
3143 /*
3144 * Check if DMAR supports zero-length reads on write only
3145 * mappings..
3146 */
3147 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
3148 !cap_zlr(iommu->cap))
3149 prot |= DMA_PTE_READ;
3150 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3151 prot |= DMA_PTE_WRITE;
3152
3153 start_vpfn = mm_to_dma_pfn(iova->pfn_lo);
3154
3155 ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
3156 if (unlikely(ret)) {
3157 /* clear the page */
3158 dma_pte_clear_range(domain, start_vpfn,
3159 start_vpfn + size - 1);
3160 /* free page tables */
3161 dma_pte_free_pagetable(domain, start_vpfn,
3162 start_vpfn + size - 1);
3163 /* free iova */
3164 __free_iova(&domain->iovad, iova);
3165 return 0;
3166 }
3167
3168 /* it's a non-present to present mapping. Only flush if caching mode */
3169 if (cap_caching_mode(iommu->cap))
3170 iommu_flush_iotlb_psi(iommu, domain->id, start_vpfn, size, 1);
3171 else
3172 iommu_flush_write_buffer(iommu);
3173
3174 return nelems;
3175 }
3176
3177 static int intel_mapping_error(struct device *dev, dma_addr_t dma_addr)
3178 {
3179 return !dma_addr;
3180 }
3181
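/* DMA operations installed as the global dma_ops from intel_iommu_init() */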
3182 struct dma_map_ops intel_dma_ops = {
3183 .alloc = intel_alloc_coherent,
3184 .free = intel_free_coherent,
3185 .map_sg = intel_map_sg,
3186 .unmap_sg = intel_unmap_sg,
3187 .map_page = intel_map_page,
3188 .unmap_page = intel_unmap_page,
3189 .mapping_error = intel_mapping_error,
3190 };
3191
3192 static inline int iommu_domain_cache_init(void)
3193 {
3194 int ret = 0;
3195
3196 iommu_domain_cache = kmem_cache_create("iommu_domain",
3197 sizeof(struct dmar_domain),
3198 0,
3199 SLAB_HWCACHE_ALIGN,
3200
3201 NULL);
3202 if (!iommu_domain_cache) {
3203 printk(KERN_ERR "Couldn't create iommu_domain cache\n");
3204 ret = -ENOMEM;
3205 }
3206
3207 return ret;
3208 }
3209
3210 static inline int iommu_devinfo_cache_init(void)
3211 {
3212 int ret = 0;
3213
3214 iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
3215 sizeof(struct device_domain_info),
3216 0,
3217 SLAB_HWCACHE_ALIGN,
3218 NULL);
3219 if (!iommu_devinfo_cache) {
3220 printk(KERN_ERR "Couldn't create devinfo cache\n");
3221 ret = -ENOMEM;
3222 }
3223
3224 return ret;
3225 }
3226
3227 static inline int iommu_iova_cache_init(void)
3228 {
3229 int ret = 0;
3230
3231 iommu_iova_cache = kmem_cache_create("iommu_iova",
3232 sizeof(struct iova),
3233 0,
3234 SLAB_HWCACHE_ALIGN,
3235 NULL);
3236 if (!iommu_iova_cache) {
3237 printk(KERN_ERR "Couldn't create iova cache\n");
3238 ret = -ENOMEM;
3239 }
3240
3241 return ret;
3242 }
3243
3244 static int __init iommu_init_mempool(void)
3245 {
3246 int ret;
3247 ret = iommu_iova_cache_init();
3248 if (ret)
3249 return ret;
3250
3251 ret = iommu_domain_cache_init();
3252 if (ret)
3253 goto domain_error;
3254
3255 ret = iommu_devinfo_cache_init();
3256 if (!ret)
3257 return ret;
3258
3259 kmem_cache_destroy(iommu_domain_cache);
3260 domain_error:
3261 kmem_cache_destroy(iommu_iova_cache);
3262
3263 return -ENOMEM;
3264 }
3265
3266 static void __init iommu_exit_mempool(void)
3267 {
3268 kmem_cache_destroy(iommu_devinfo_cache);
3269 kmem_cache_destroy(iommu_domain_cache);
3270 kmem_cache_destroy(iommu_iova_cache);
3271
3272 }
3273
3274 static void quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
3275 {
3276 struct dmar_drhd_unit *drhd;
3277 u32 vtbar;
3278 int rc;
3279
3280 /* We know that this device on this chipset has its own IOMMU.
3281 * If we find it under a different IOMMU, then the BIOS is lying
3282 * to us. Hope that the IOMMU for this device is actually
3283 * disabled, and it needs no translation...
3284 */
3285 rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
3286 if (rc) {
3287 /* "can't" happen */
3288 dev_info(&pdev->dev, "failed to run vt-d quirk\n");
3289 return;
3290 }
3291 vtbar &= 0xffff0000;
3292
3293 /* we know that this iommu should be at offset 0xa000 from vtbar */
3294 drhd = dmar_find_matched_drhd_unit(pdev);
3295 if (WARN_TAINT_ONCE(!drhd || drhd->reg_base_addr - vtbar != 0xa000,
3296 TAINT_FIRMWARE_WORKAROUND,
3297 "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n"))
3298 pdev->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3299 }
3300 DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_SNB, quirk_ioat_snb_local_iommu);
3301
3302 static void __init init_no_remapping_devices(void)
3303 {
3304 struct dmar_drhd_unit *drhd;
3305 struct pci_dev *dev;
3306 int i;
3307
3308 for_each_drhd_unit(drhd) {
3309 if (!drhd->include_all) {
3310 for_each_active_dev_scope(drhd->devices,
3311 drhd->devices_cnt, i, dev)
3312 break;
3313 /* ignore DMAR unit if no pci devices exist */
3314 if (i == drhd->devices_cnt)
3315 drhd->ignored = 1;
3316 }
3317 }
3318
3319 for_each_active_drhd_unit(drhd) {
3320 if (drhd->include_all)
3321 continue;
3322
3323 for_each_active_dev_scope(drhd->devices,
3324 drhd->devices_cnt, i, dev)
3325 if (!IS_GFX_DEVICE(dev))
3326 break;
3327 if (i < drhd->devices_cnt)
3328 continue;
3329
3330 /* This IOMMU has *only* gfx devices. Either bypass it or
3331 set the gfx_mapped flag, as appropriate */
3332 if (dmar_map_gfx) {
3333 intel_iommu_gfx_mapped = 1;
3334 } else {
3335 drhd->ignored = 1;
3336 for_each_active_dev_scope(drhd->devices,
3337 drhd->devices_cnt, i, dev)
3338 dev->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3339 }
3340 }
3341 }
3342
3343 #ifdef CONFIG_SUSPEND
3344 static int init_iommu_hw(void)
3345 {
3346 struct dmar_drhd_unit *drhd;
3347 struct intel_iommu *iommu = NULL;
3348
3349 for_each_active_iommu(iommu, drhd)
3350 if (iommu->qi)
3351 dmar_reenable_qi(iommu);
3352
3353 for_each_iommu(iommu, drhd) {
3354 if (drhd->ignored) {
3355 /*
3356 * we always have to disable PMRs or DMA may fail on
3357 * this device
3358 */
3359 if (force_on)
3360 iommu_disable_protect_mem_regions(iommu);
3361 continue;
3362 }
3363
3364 iommu_flush_write_buffer(iommu);
3365
3366 iommu_set_root_entry(iommu);
3367
3368 iommu->flush.flush_context(iommu, 0, 0, 0,
3369 DMA_CCMD_GLOBAL_INVL);
3370 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3371 DMA_TLB_GLOBAL_FLUSH);
3372 if (iommu_enable_translation(iommu))
3373 return 1;
3374 iommu_disable_protect_mem_regions(iommu);
3375 }
3376
3377 return 0;
3378 }
3379
3380 static void iommu_flush_all(void)
3381 {
3382 struct dmar_drhd_unit *drhd;
3383 struct intel_iommu *iommu;
3384
3385 for_each_active_iommu(iommu, drhd) {
3386 iommu->flush.flush_context(iommu, 0, 0, 0,
3387 DMA_CCMD_GLOBAL_INVL);
3388 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3389 DMA_TLB_GLOBAL_FLUSH);
3390 }
3391 }
3392
3393 static int iommu_suspend(void)
3394 {
3395 struct dmar_drhd_unit *drhd;
3396 struct intel_iommu *iommu = NULL;
3397 unsigned long flag;
3398
3399 for_each_active_iommu(iommu, drhd) {
3400 iommu->iommu_state = kzalloc(sizeof(u32) * MAX_SR_DMAR_REGS,
3401 GFP_ATOMIC);
3402 if (!iommu->iommu_state)
3403 goto nomem;
3404 }
3405
3406 iommu_flush_all();
3407
3408 for_each_active_iommu(iommu, drhd) {
3409 iommu_disable_translation(iommu);
3410
3411 raw_spin_lock_irqsave(&iommu->register_lock, flag);
3412
3413 iommu->iommu_state[SR_DMAR_FECTL_REG] =
3414 readl(iommu->reg + DMAR_FECTL_REG);
3415 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
3416 readl(iommu->reg + DMAR_FEDATA_REG);
3417 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
3418 readl(iommu->reg + DMAR_FEADDR_REG);
3419 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
3420 readl(iommu->reg + DMAR_FEUADDR_REG);
3421
3422 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3423 }
3424 return 0;
3425
3426 nomem:
3427 for_each_active_iommu(iommu, drhd)
3428 kfree(iommu->iommu_state);
3429
3430 return -ENOMEM;
3431 }
3432
3433 static void iommu_resume(void)
3434 {
3435 struct dmar_drhd_unit *drhd;
3436 struct intel_iommu *iommu = NULL;
3437 unsigned long flag;
3438
3439 if (init_iommu_hw()) {
3440 if (force_on)
3441 panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
3442 else
3443 WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
3444 return;
3445 }
3446
3447 for_each_active_iommu(iommu, drhd) {
3448
3449 raw_spin_lock_irqsave(&iommu->register_lock, flag);
3450
3451 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
3452 iommu->reg + DMAR_FECTL_REG);
3453 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
3454 iommu->reg + DMAR_FEDATA_REG);
3455 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
3456 iommu->reg + DMAR_FEADDR_REG);
3457 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
3458 iommu->reg + DMAR_FEUADDR_REG);
3459
3460 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3461 }
3462
3463 for_each_active_iommu(iommu, drhd)
3464 kfree(iommu->iommu_state);
3465 }
3466
3467 static struct syscore_ops iommu_syscore_ops = {
3468 .resume = iommu_resume,
3469 .suspend = iommu_suspend,
3470 };
3471
3472 static void __init init_iommu_pm_ops(void)
3473 {
3474 register_syscore_ops(&iommu_syscore_ops);
3475 }
3476
3477 #else
3478 static inline void init_iommu_pm_ops(void) {}
3479 #endif /* CONFIG_SUSPEND */
3480
3481
3482 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header)
3483 {
3484 struct acpi_dmar_reserved_memory *rmrr;
3485 struct dmar_rmrr_unit *rmrru;
3486
3487 rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
3488 if (!rmrru)
3489 return -ENOMEM;
3490
3491 rmrru->hdr = header;
3492 rmrr = (struct acpi_dmar_reserved_memory *)header;
3493 rmrru->base_address = rmrr->base_address;
3494 rmrru->end_address = rmrr->end_address;
3495 rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
3496 ((void *)rmrr) + rmrr->header.length,
3497 &rmrru->devices_cnt);
3498 if (rmrru->devices_cnt && rmrru->devices == NULL) {
3499 kfree(rmrru);
3500 return -ENOMEM;
3501 }
3502
3503 list_add(&rmrru->list, &dmar_rmrr_units);
3504
3505 return 0;
3506 }
3507
3508 int __init dmar_parse_one_atsr(struct acpi_dmar_header *hdr)
3509 {
3510 struct acpi_dmar_atsr *atsr;
3511 struct dmar_atsr_unit *atsru;
3512
3513 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3514 atsru = kzalloc(sizeof(*atsru), GFP_KERNEL);
3515 if (!atsru)
3516 return -ENOMEM;
3517
3518 atsru->hdr = hdr;
3519 atsru->include_all = atsr->flags & 0x1;
3520 if (!atsru->include_all) {
3521 atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
3522 (void *)atsr + atsr->header.length,
3523 &atsru->devices_cnt);
3524 if (atsru->devices_cnt && atsru->devices == NULL) {
3525 kfree(atsru);
3526 return -ENOMEM;
3527 }
3528 }
3529
3530 list_add_rcu(&atsru->list, &dmar_atsr_units);
3531
3532 return 0;
3533 }
3534
3535 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
3536 {
3537 dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
3538 kfree(atsru);
3539 }
3540
3541 static void intel_iommu_free_dmars(void)
3542 {
3543 struct dmar_rmrr_unit *rmrru, *rmrr_n;
3544 struct dmar_atsr_unit *atsru, *atsr_n;
3545
3546 list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
3547 list_del(&rmrru->list);
3548 dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
3549 kfree(rmrru);
3550 }
3551
3552 list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
3553 list_del(&atsru->list);
3554 intel_iommu_free_atsr(atsru);
3555 }
3556 }
3557
3558 int dmar_find_matched_atsr_unit(struct pci_dev *dev)
3559 {
3560 int i, ret = 1;
3561 struct pci_bus *bus;
3562 struct pci_dev *bridge = NULL, *tmp;
3563 struct acpi_dmar_atsr *atsr;
3564 struct dmar_atsr_unit *atsru;
3565
3566 dev = pci_physfn(dev);
3567 for (bus = dev->bus; bus; bus = bus->parent) {
3568 bridge = bus->self;
3569 if (!bridge || !pci_is_pcie(bridge) ||
3570 pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
3571 return 0;
3572 if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
3573 break;
3574 }
3575 if (!bridge)
3576 return 0;
3577
3578 rcu_read_lock();
3579 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
3580 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3581 if (atsr->segment != pci_domain_nr(dev->bus))
3582 continue;
3583
3584 for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
3585 if (tmp == bridge)
3586 goto out;
3587
3588 if (atsru->include_all)
3589 goto out;
3590 }
3591 ret = 0;
3592 out:
3593 rcu_read_unlock();
3594
3595 return ret;
3596 }
3597
3598 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
3599 {
3600 int ret = 0;
3601 struct dmar_rmrr_unit *rmrru;
3602 struct dmar_atsr_unit *atsru;
3603 struct acpi_dmar_atsr *atsr;
3604 struct acpi_dmar_reserved_memory *rmrr;
3605
3606 if (!intel_iommu_enabled && system_state != SYSTEM_BOOTING)
3607 return 0;
3608
3609 list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
3610 rmrr = container_of(rmrru->hdr,
3611 struct acpi_dmar_reserved_memory, header);
3612 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3613 ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
3614 ((void *)rmrr) + rmrr->header.length,
3615 rmrr->segment, rmrru->devices,
3616 rmrru->devices_cnt);
3617 if (ret > 0)
3618 break;
3619 else if (ret < 0)
3620 return ret;
3621 } else if (info->event == BUS_NOTIFY_DEL_DEVICE) {
3622 if (dmar_remove_dev_scope(info, rmrr->segment,
3623 rmrru->devices, rmrru->devices_cnt))
3624 break;
3625 }
3626 }
3627
3628 list_for_each_entry(atsru, &dmar_atsr_units, list) {
3629 if (atsru->include_all)
3630 continue;
3631
3632 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3633 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3634 ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
3635 (void *)atsr + atsr->header.length,
3636 atsr->segment, atsru->devices,
3637 atsru->devices_cnt);
3638 if (ret > 0)
3639 break;
3640 else if (ret < 0)
3641 return ret;
3642 } else if (info->event == BUS_NOTIFY_DEL_DEVICE) {
3643 if (dmar_remove_dev_scope(info, atsr->segment,
3644 atsru->devices, atsru->devices_cnt))
3645 break;
3646 }
3647 }
3648
3649 return 0;
3650 }
3651
3652 /*
3653 * Here we only respond to a device being unbound from its driver.
3654 *
3655 * A newly added device is not attached to its DMAR domain here yet. That will
3656 * happen when the device is first mapped to an iova.
3657 */
3658 static int device_notifier(struct notifier_block *nb,
3659 unsigned long action, void *data)
3660 {
3661 struct device *dev = data;
3662 struct pci_dev *pdev = to_pci_dev(dev);
3663 struct dmar_domain *domain;
3664
3665 if (iommu_dummy(pdev))
3666 return 0;
3667
3668 if (action != BUS_NOTIFY_UNBOUND_DRIVER &&
3669 action != BUS_NOTIFY_DEL_DEVICE)
3670 return 0;
3671
3672 domain = find_domain(pdev);
3673 if (!domain)
3674 return 0;
3675
3676 down_read(&dmar_global_lock);
3677 domain_remove_one_dev_info(domain, pdev);
3678 if (!(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE) &&
3679 !(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY) &&
3680 list_empty(&domain->devices))
3681 domain_exit(domain);
3682 up_read(&dmar_global_lock);
3683
3684 return 0;
3685 }
3686
3687 static struct notifier_block device_nb = {
3688 .notifier_call = device_notifier,
3689 };
3690
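/*
 * Memory hotplug notifier: extend the static identity map when a range
 * goes online, and tear down its IOVAs, PTEs and page tables (with the
 * appropriate IOTLB flushes) when it goes offline again.
 */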
3691 static int intel_iommu_memory_notifier(struct notifier_block *nb,
3692 unsigned long val, void *v)
3693 {
3694 struct memory_notify *mhp = v;
3695 unsigned long long start, end;
3696 unsigned long start_vpfn, last_vpfn;
3697
3698 switch (val) {
3699 case MEM_GOING_ONLINE:
3700 start = mhp->start_pfn << PAGE_SHIFT;
3701 end = ((mhp->start_pfn + mhp->nr_pages) << PAGE_SHIFT) - 1;
3702 if (iommu_domain_identity_map(si_domain, start, end)) {
3703 pr_warn("dmar: failed to build identity map for [%llx-%llx]\n",
3704 start, end);
3705 return NOTIFY_BAD;
3706 }
3707 break;
3708
3709 case MEM_OFFLINE:
3710 case MEM_CANCEL_ONLINE:
3711 start_vpfn = mm_to_dma_pfn(mhp->start_pfn);
3712 last_vpfn = mm_to_dma_pfn(mhp->start_pfn + mhp->nr_pages - 1);
3713 while (start_vpfn <= last_vpfn) {
3714 struct iova *iova;
3715 struct dmar_drhd_unit *drhd;
3716 struct intel_iommu *iommu;
3717
3718 iova = find_iova(&si_domain->iovad, start_vpfn);
3719 if (iova == NULL) {
3720 pr_debug("dmar: failed get IOVA for PFN %lx\n",
3721 start_vpfn);
3722 break;
3723 }
3724
3725 iova = split_and_remove_iova(&si_domain->iovad, iova,
3726 start_vpfn, last_vpfn);
3727 if (iova == NULL) {
3728 pr_warn("dmar: failed to split IOVA PFN [%lx-%lx]\n",
3729 start_vpfn, last_vpfn);
3730 return NOTIFY_BAD;
3731 }
3732
3733 rcu_read_lock();
3734 for_each_active_iommu(iommu, drhd)
3735 iommu_flush_iotlb_psi(iommu, si_domain->id,
3736 iova->pfn_lo,
3737 iova->pfn_hi - iova->pfn_lo + 1, 0);
3738 rcu_read_unlock();
3739 dma_pte_clear_range(si_domain, iova->pfn_lo,
3740 iova->pfn_hi);
3741 dma_pte_free_pagetable(si_domain, iova->pfn_lo,
3742 iova->pfn_hi);
3743
3744 start_vpfn = iova->pfn_hi + 1;
3745 free_iova_mem(iova);
3746 }
3747 break;
3748 }
3749
3750 return NOTIFY_OK;
3751 }
3752
3753 static struct notifier_block intel_iommu_memory_nb = {
3754 .notifier_call = intel_iommu_memory_notifier,
3755 .priority = 0
3756 };
3757
3758 int __init intel_iommu_init(void)
3759 {
3760 int ret = -ENODEV;
3761 struct dmar_drhd_unit *drhd;
3762 struct intel_iommu *iommu;
3763
3764 /* VT-d is required for a TXT/tboot launch, so enforce that */
3765 force_on = tboot_force_iommu();
3766
3767 if (iommu_init_mempool()) {
3768 if (force_on)
3769 panic("tboot: Failed to initialize iommu memory\n");
3770 return -ENOMEM;
3771 }
3772
3773 down_write(&dmar_global_lock);
3774 if (dmar_table_init()) {
3775 if (force_on)
3776 panic("tboot: Failed to initialize DMAR table\n");
3777 goto out_free_dmar;
3778 }
3779
3780 /*
3781 * Disable translation if already enabled prior to OS handover.
3782 */
3783 for_each_active_iommu(iommu, drhd)
3784 if (iommu->gcmd & DMA_GCMD_TE)
3785 iommu_disable_translation(iommu);
3786
3787 if (dmar_dev_scope_init() < 0) {
3788 if (force_on)
3789 panic("tboot: Failed to initialize DMAR device scope\n");
3790 goto out_free_dmar;
3791 }
3792
3793 if (no_iommu || dmar_disabled)
3794 goto out_free_dmar;
3795
3796 if (list_empty(&dmar_rmrr_units))
3797 printk(KERN_INFO "DMAR: No RMRR found\n");
3798
3799 if (list_empty(&dmar_atsr_units))
3800 printk(KERN_INFO "DMAR: No ATSR found\n");
3801
3802 if (dmar_init_reserved_ranges()) {
3803 if (force_on)
3804 panic("tboot: Failed to reserve iommu ranges\n");
3805 goto out_free_reserved_range;
3806 }
3807
3808 init_no_remapping_devices();
3809
3810 ret = init_dmars();
3811 if (ret) {
3812 if (force_on)
3813 panic("tboot: Failed to initialize DMARs\n");
3814 printk(KERN_ERR "IOMMU: dmar init failed\n");
3815 goto out_free_reserved_range;
3816 }
3817 up_write(&dmar_global_lock);
3818 printk(KERN_INFO
3819 "PCI-DMA: Intel(R) Virtualization Technology for Directed I/O\n");
3820
3821 init_timer(&unmap_timer);
3822 #ifdef CONFIG_SWIOTLB
3823 swiotlb = 0;
3824 #endif
3825 dma_ops = &intel_dma_ops;
3826
3827 init_iommu_pm_ops();
3828
3829 bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
3830 bus_register_notifier(&pci_bus_type, &device_nb);
3831 if (si_domain && !hw_pass_through)
3832 register_memory_notifier(&intel_iommu_memory_nb);
3833
3834 intel_iommu_enabled = 1;
3835
3836 return 0;
3837
3838 out_free_reserved_range:
3839 put_iova_domain(&reserved_iova_list);
3840 out_free_dmar:
3841 intel_iommu_free_dmars();
3842 up_write(&dmar_global_lock);
3843 iommu_exit_mempool();
3844 return ret;
3845 }
3846
3847 static void iommu_detach_dependent_devices(struct intel_iommu *iommu,
3848 struct pci_dev *pdev)
3849 {
3850 struct pci_dev *tmp, *parent;
3851
3852 if (!iommu || !pdev)
3853 return;
3854
3855 /* dependent device detach */
3856 tmp = pci_find_upstream_pcie_bridge(pdev);
3857 /* Secondary interface's bus number and devfn 0 */
3858 if (tmp) {
3859 parent = pdev->bus->self;
3860 while (parent != tmp) {
3861 iommu_detach_dev(iommu, parent->bus->number,
3862 parent->devfn);
3863 parent = parent->bus->self;
3864 }
3865 if (pci_is_pcie(tmp)) /* this is a PCIe-to-PCI bridge */
3866 iommu_detach_dev(iommu,
3867 tmp->subordinate->number, 0);
3868 else /* this is a legacy PCI bridge */
3869 iommu_detach_dev(iommu, tmp->bus->number,
3870 tmp->devfn);
3871 }
3872 }
3873
3874 static void domain_remove_one_dev_info(struct dmar_domain *domain,
3875 struct pci_dev *pdev)
3876 {
3877 struct device_domain_info *info, *tmp;
3878 struct intel_iommu *iommu;
3879 unsigned long flags;
3880 int found = 0;
3881
3882 iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
3883 pdev->devfn);
3884 if (!iommu)
3885 return;
3886
3887 spin_lock_irqsave(&device_domain_lock, flags);
3888 list_for_each_entry_safe(info, tmp, &domain->devices, link) {
3889 if (info->segment == pci_domain_nr(pdev->bus) &&
3890 info->bus == pdev->bus->number &&
3891 info->devfn == pdev->devfn) {
3892 unlink_domain_info(info);
3893 spin_unlock_irqrestore(&device_domain_lock, flags);
3894
3895 iommu_disable_dev_iotlb(info);
3896 iommu_detach_dev(iommu, info->bus, info->devfn);
3897 iommu_detach_dependent_devices(iommu, pdev);
3898 free_devinfo_mem(info);
3899
3900 spin_lock_irqsave(&device_domain_lock, flags);
3901
3902 if (found)
3903 break;
3904 else
3905 continue;
3906 }
3907
3908 /* if there are no other devices under the same iommu
3909 * owned by this domain, clear this iommu in iommu_bmp and
3910 * update the iommu count and coherency
3911 */
3912 if (iommu == device_to_iommu(info->segment, info->bus,
3913 info->devfn))
3914 found = 1;
3915 }
3916
3917 spin_unlock_irqrestore(&device_domain_lock, flags);
3918
3919 if (found == 0) {
3920 unsigned long tmp_flags;
3921 spin_lock_irqsave(&domain->iommu_lock, tmp_flags);
3922 clear_bit(iommu->seq_id, domain->iommu_bmp);
3923 domain->iommu_count--;
3924 domain_update_iommu_cap(domain);
3925 spin_unlock_irqrestore(&domain->iommu_lock, tmp_flags);
3926
3927 if (!(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE) &&
3928 !(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY)) {
3929 spin_lock_irqsave(&iommu->lock, tmp_flags);
3930 clear_bit(domain->id, iommu->domain_ids);
3931 iommu->domains[domain->id] = NULL;
3932 spin_unlock_irqrestore(&iommu->lock, tmp_flags);
3933 }
3934 }
3935 }
3936
3937 static int md_domain_init(struct dmar_domain *domain, int guest_width)
3938 {
3939 int adjust_width;
3940
3941 init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
3942 domain_reserve_special_ranges(domain);
3943
3944 /* calculate AGAW */
3945 domain->gaw = guest_width;
3946 adjust_width = guestwidth_to_adjustwidth(guest_width);
3947 domain->agaw = width_to_agaw(adjust_width);
3948
3949 domain->iommu_coherency = 0;
3950 domain->iommu_snooping = 0;
3951 domain->iommu_superpage = 0;
3952 domain->max_addr = 0;
3953 domain->nid = -1;
3954
3955 /* always allocate the top pgd */
3956 domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
3957 if (!domain->pgd)
3958 return -ENOMEM;
3959 domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
3960 return 0;
3961 }
3962
3963 static int intel_iommu_domain_init(struct iommu_domain *domain)
3964 {
3965 struct dmar_domain *dmar_domain;
3966
3967 dmar_domain = alloc_domain(true);
3968 if (!dmar_domain) {
3969 printk(KERN_ERR
3970 "intel_iommu_domain_init: dmar_domain == NULL\n");
3971 return -ENOMEM;
3972 }
3973 if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
3974 printk(KERN_ERR
3975 "intel_iommu_domain_init() failed\n");
3976 domain_exit(dmar_domain);
3977 return -ENOMEM;
3978 }
3979 domain_update_iommu_cap(dmar_domain);
3980 domain->priv = dmar_domain;
3981
3982 domain->geometry.aperture_start = 0;
3983 domain->geometry.aperture_end = __DOMAIN_MAX_ADDR(dmar_domain->gaw);
3984 domain->geometry.force_aperture = true;
3985
3986 return 0;
3987 }
3988
3989 static void intel_iommu_domain_destroy(struct iommu_domain *domain)
3990 {
3991 struct dmar_domain *dmar_domain = domain->priv;
3992
3993 domain->priv = NULL;
3994 domain_exit(dmar_domain);
3995 }
3996
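/*
 * ->attach_dev callback: tear down any context mapping the device
 * already has, check that this IOMMU's address width can cover the
 * domain's highest mapped address, and strip extra page-table levels
 * if the domain was built wider than this IOMMU supports, before
 * adding the device to the domain.
 */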
3997 static int intel_iommu_attach_device(struct iommu_domain *domain,
3998 struct device *dev)
3999 {
4000 struct dmar_domain *dmar_domain = domain->priv;
4001 struct pci_dev *pdev = to_pci_dev(dev);
4002 struct intel_iommu *iommu;
4003 int addr_width;
4004
4005 /* normally pdev is not mapped */
4006 if (unlikely(domain_context_mapped(pdev))) {
4007 struct dmar_domain *old_domain;
4008
4009 old_domain = find_domain(pdev);
4010 if (old_domain) {
4011 if (dmar_domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE ||
4012 dmar_domain->flags & DOMAIN_FLAG_STATIC_IDENTITY)
4013 domain_remove_one_dev_info(old_domain, pdev);
4014 else
4015 domain_remove_dev_info(old_domain);
4016 }
4017 }
4018
4019 iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
4020 pdev->devfn);
4021 if (!iommu)
4022 return -ENODEV;
4023
4024 /* check if this iommu agaw is sufficient for max mapped address */
4025 addr_width = agaw_to_width(iommu->agaw);
4026 if (addr_width > cap_mgaw(iommu->cap))
4027 addr_width = cap_mgaw(iommu->cap);
4028
4029 if (dmar_domain->max_addr > (1LL << addr_width)) {
4030 printk(KERN_ERR "%s: iommu width (%d) is not "
4031 "sufficient for the mapped address (%llx)\n",
4032 __func__, addr_width, dmar_domain->max_addr);
4033 return -EFAULT;
4034 }
4035 dmar_domain->gaw = addr_width;
4036
4037 /*
4038 * Knock out extra levels of page tables if necessary
4039 */
4040 while (iommu->agaw < dmar_domain->agaw) {
4041 struct dma_pte *pte;
4042
4043 pte = dmar_domain->pgd;
4044 if (dma_pte_present(pte)) {
4045 dmar_domain->pgd = (struct dma_pte *)
4046 phys_to_virt(dma_pte_addr(pte));
4047 free_pgtable_page(pte);
4048 }
4049 dmar_domain->agaw--;
4050 }
4051
4052 return domain_add_dev_info(dmar_domain, pdev, CONTEXT_TT_MULTI_LEVEL);
4053 }
4054
4055 static void intel_iommu_detach_device(struct iommu_domain *domain,
4056 struct device *dev)
4057 {
4058 struct dmar_domain *dmar_domain = domain->priv;
4059 struct pci_dev *pdev = to_pci_dev(dev);
4060
4061 domain_remove_one_dev_info(dmar_domain, pdev);
4062 }
4063
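/*
 * ->map callback: translate the generic IOMMU_READ/WRITE/CACHE flags
 * into DMA_PTE_* bits, grow the domain's max_addr bookkeeping (failing
 * if the mapping would exceed the domain's address width), and install
 * the page-table entries.  The page count is rounded up so that an hpa
 * that is not page-aligned still gets its tail covered.
 */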
4064 static int intel_iommu_map(struct iommu_domain *domain,
4065 unsigned long iova, phys_addr_t hpa,
4066 size_t size, int iommu_prot)
4067 {
4068 struct dmar_domain *dmar_domain = domain->priv;
4069 u64 max_addr;
4070 int prot = 0;
4071 int ret;
4072
4073 if (iommu_prot & IOMMU_READ)
4074 prot |= DMA_PTE_READ;
4075 if (iommu_prot & IOMMU_WRITE)
4076 prot |= DMA_PTE_WRITE;
4077 if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
4078 prot |= DMA_PTE_SNP;
4079
4080 max_addr = iova + size;
4081 if (dmar_domain->max_addr < max_addr) {
4082 u64 end;
4083
4084 /* check if minimum agaw is sufficient for mapped address */
4085 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
4086 if (end < max_addr) {
4087 printk(KERN_ERR "%s: iommu width (%d) is not "
4088 "sufficient for the mapped address (%llx)\n",
4089 __func__, dmar_domain->gaw, max_addr);
4090 return -EFAULT;
4091 }
4092 dmar_domain->max_addr = max_addr;
4093 }
4094 	/* Round up to a whole number of VTD pages; the low bits of hpa
4095 	   may push the mapping onto one extra page */
4096 size = aligned_nrpages(hpa, size);
4097 ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
4098 hpa >> VTD_PAGE_SHIFT, size, prot);
4099 return ret;
4100 }
4101
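/*
 * ->unmap callback.  As the comment inside notes, if @iova is covered by
 * a large-page PTE the whole large page is cleared and the enlarged size
 * is returned so the caller can account for it.  For example (illustrative
 * only): unmapping 4KiB that falls inside a 2MiB superpage mapping clears
 * the full 2MiB region and returns 2MiB.
 */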
4102 static size_t intel_iommu_unmap(struct iommu_domain *domain,
4103 unsigned long iova, size_t size)
4104 {
4105 struct dmar_domain *dmar_domain = domain->priv;
4106 int level = 0;
4107
4108 /* Cope with horrid API which requires us to unmap more than the
4109 size argument if it happens to be a large-page mapping. */
4110 if (!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level))
4111 BUG();
4112
4113 if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
4114 size = VTD_PAGE_SIZE << level_to_offset_bits(level);
4115
4116 dma_pte_clear_range(dmar_domain, iova >> VTD_PAGE_SHIFT,
4117 (iova + size - 1) >> VTD_PAGE_SHIFT);
4118
4119 if (dmar_domain->max_addr == iova + size)
4120 dmar_domain->max_addr = iova;
4121
4122 return size;
4123 }
4124
4125 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
4126 dma_addr_t iova)
4127 {
4128 struct dmar_domain *dmar_domain = domain->priv;
4129 struct dma_pte *pte;
4130 int level = 0;
4131 u64 phys = 0;
4132
4133 pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level);
4134 if (pte)
4135 phys = dma_pte_addr(pte);
4136
4137 return phys;
4138 }
4139
4140 static int intel_iommu_domain_has_cap(struct iommu_domain *domain,
4141 unsigned long cap)
4142 {
4143 struct dmar_domain *dmar_domain = domain->priv;
4144
4145 if (cap == IOMMU_CAP_CACHE_COHERENCY)
4146 return dmar_domain->iommu_snooping;
4147 if (cap == IOMMU_CAP_INTR_REMAP)
4148 return irq_remapping_enabled;
4149
4150 return 0;
4151 }
4152
4153 #define REQ_ACS_FLAGS (PCI_ACS_SV | PCI_ACS_RR | PCI_ACS_CR | PCI_ACS_UF)
4154
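/*
 * ->add_device callback: find the device that actually issues DMA on
 * the bus (legacy PCI bridges and quirked devices may alias requests),
 * then walk towards the root bus checking ACS isolation; every device
 * that cannot be isolated from it ends up in the same iommu_group.
 */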
4155 static int intel_iommu_add_device(struct device *dev)
4156 {
4157 struct pci_dev *pdev = to_pci_dev(dev);
4158 struct pci_dev *bridge, *dma_pdev = NULL;
4159 struct iommu_group *group;
4160 int ret;
4161
4162 if (!device_to_iommu(pci_domain_nr(pdev->bus),
4163 pdev->bus->number, pdev->devfn))
4164 return -ENODEV;
4165
4166 bridge = pci_find_upstream_pcie_bridge(pdev);
4167 if (bridge) {
4168 if (pci_is_pcie(bridge))
4169 dma_pdev = pci_get_domain_bus_and_slot(
4170 pci_domain_nr(pdev->bus),
4171 bridge->subordinate->number, 0);
4172 if (!dma_pdev)
4173 dma_pdev = pci_dev_get(bridge);
4174 } else
4175 dma_pdev = pci_dev_get(pdev);
4176
4177 /* Account for quirked devices */
4178 swap_pci_ref(&dma_pdev, pci_get_dma_source(dma_pdev));
4179
4180 /*
4181 * If it's a multifunction device that does not support our
4182 	 * required ACS flags, add it to the same group as the lowest
4183 	 * numbered function that also does not support the required ACS flags.
4184 */
4185 if (dma_pdev->multifunction &&
4186 !pci_acs_enabled(dma_pdev, REQ_ACS_FLAGS)) {
4187 u8 i, slot = PCI_SLOT(dma_pdev->devfn);
4188
4189 for (i = 0; i < 8; i++) {
4190 struct pci_dev *tmp;
4191
4192 tmp = pci_get_slot(dma_pdev->bus, PCI_DEVFN(slot, i));
4193 if (!tmp)
4194 continue;
4195
4196 if (!pci_acs_enabled(tmp, REQ_ACS_FLAGS)) {
4197 swap_pci_ref(&dma_pdev, tmp);
4198 break;
4199 }
4200 pci_dev_put(tmp);
4201 }
4202 }
4203
4204 /*
4205 * Devices on the root bus go through the iommu. If that's not us,
4206 * find the next upstream device and test ACS up to the root bus.
4207 * Finding the next device may require skipping virtual buses.
4208 */
4209 while (!pci_is_root_bus(dma_pdev->bus)) {
4210 struct pci_bus *bus = dma_pdev->bus;
4211
4212 while (!bus->self) {
4213 if (!pci_is_root_bus(bus))
4214 bus = bus->parent;
4215 else
4216 goto root_bus;
4217 }
4218
4219 if (pci_acs_path_enabled(bus->self, NULL, REQ_ACS_FLAGS))
4220 break;
4221
4222 swap_pci_ref(&dma_pdev, pci_dev_get(bus->self));
4223 }
4224
4225 root_bus:
4226 group = iommu_group_get(&dma_pdev->dev);
4227 pci_dev_put(dma_pdev);
4228 if (!group) {
4229 group = iommu_group_alloc();
4230 if (IS_ERR(group))
4231 return PTR_ERR(group);
4232 }
4233
4234 ret = iommu_group_add_device(group, dev);
4235
4236 iommu_group_put(group);
4237 return ret;
4238 }
4239
4240 static void intel_iommu_remove_device(struct device *dev)
4241 {
4242 iommu_group_remove_device(dev);
4243 }
4244
4245 static struct iommu_ops intel_iommu_ops = {
4246 .domain_init = intel_iommu_domain_init,
4247 .domain_destroy = intel_iommu_domain_destroy,
4248 .attach_dev = intel_iommu_attach_device,
4249 .detach_dev = intel_iommu_detach_device,
4250 .map = intel_iommu_map,
4251 .unmap = intel_iommu_unmap,
4252 .iova_to_phys = intel_iommu_iova_to_phys,
4253 .domain_has_cap = intel_iommu_domain_has_cap,
4254 .add_device = intel_iommu_add_device,
4255 .remove_device = intel_iommu_remove_device,
4256 .pgsize_bitmap = INTEL_IOMMU_PGSIZES,
4257 };
4258
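/*
 * These callbacks are reached through the generic IOMMU API
 * (drivers/iommu/iommu.c).  An illustrative sketch of a hypothetical
 * in-kernel caller (error handling omitted; "dev", "iova", "paddr" and
 * "size" are placeholders, not names used by this driver):
 *
 *	struct iommu_domain *dom = iommu_domain_alloc(&pci_bus_type);
 *
 *	iommu_attach_device(dom, dev);         -> intel_iommu_attach_device()
 *	iommu_map(dom, iova, paddr, size,
 *		  IOMMU_READ | IOMMU_WRITE);   -> intel_iommu_map()
 *	iommu_unmap(dom, iova, size);          -> intel_iommu_unmap()
 *	iommu_detach_device(dom, dev);         -> intel_iommu_detach_device()
 *	iommu_domain_free(dom);                -> intel_iommu_domain_destroy()
 */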
4259 static void quirk_iommu_g4x_gfx(struct pci_dev *dev)
4260 {
4261 /* G4x/GM45 integrated gfx dmar support is totally busted. */
4262 printk(KERN_INFO "DMAR: Disabling IOMMU for graphics on this chipset\n");
4263 dmar_map_gfx = 0;
4264 }
4265
4266 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_g4x_gfx);
4267 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_g4x_gfx);
4268 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_g4x_gfx);
4269 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_g4x_gfx);
4270 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_g4x_gfx);
4271 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_g4x_gfx);
4272 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_g4x_gfx);
4273
4274 static void quirk_iommu_rwbf(struct pci_dev *dev)
4275 {
4276 /*
4277 * Mobile 4 Series Chipset neglects to set RWBF capability,
4278 * but needs it. Same seems to hold for the desktop versions.
4279 */
4280 printk(KERN_INFO "DMAR: Forcing write-buffer flush capability\n");
4281 rwbf_quirk = 1;
4282 }
4283
4284 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
4285 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
4286 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
4287 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
4288 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
4289 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
4290 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
4291
4292 #define GGC 0x52
4293 #define GGC_MEMORY_SIZE_MASK (0xf << 8)
4294 #define GGC_MEMORY_SIZE_NONE (0x0 << 8)
4295 #define GGC_MEMORY_SIZE_1M (0x1 << 8)
4296 #define GGC_MEMORY_SIZE_2M (0x3 << 8)
4297 #define GGC_MEMORY_VT_ENABLED (0x8 << 8)
4298 #define GGC_MEMORY_SIZE_2M_VT (0x9 << 8)
4299 #define GGC_MEMORY_SIZE_3M_VT (0xa << 8)
4300 #define GGC_MEMORY_SIZE_4M_VT (0xb << 8)
4301
4302 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
4303 {
4304 unsigned short ggc;
4305
4306 if (pci_read_config_word(dev, GGC, &ggc))
4307 return;
4308
4309 if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
4310 printk(KERN_INFO "DMAR: BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
4311 dmar_map_gfx = 0;
4312 } else if (dmar_map_gfx) {
4313 /* we have to ensure the gfx device is idle before we flush */
4314 printk(KERN_INFO "DMAR: Disabling batched IOTLB flush on Ironlake\n");
4315 intel_iommu_strict = 1;
4316 }
4317 }
4318 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
4319 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
4320 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
4321 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
4322
4323 /* On Tylersburg chipsets, some BIOSes have been known to enable the
4324 ISOCH DMAR unit for the Azalia sound device, but not give it any
4325 TLB entries, which causes it to deadlock. Check for that. We do
4326 this in a function called from init_dmars(), instead of in a PCI
4327 quirk, because we don't want to print the obnoxious "BIOS broken"
4328 message if VT-d is actually disabled.
4329 */
4330 static void __init check_tylersburg_isoch(void)
4331 {
4332 struct pci_dev *pdev;
4333 uint32_t vtisochctrl;
4334
4335 /* If there's no Azalia in the system anyway, forget it. */
4336 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
4337 if (!pdev)
4338 return;
4339 pci_dev_put(pdev);
4340
4341 /* System Management Registers. Might be hidden, in which case
4342 we can't do the sanity check. But that's OK, because the
4343 known-broken BIOSes _don't_ actually hide it, so far. */
4344 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
4345 if (!pdev)
4346 return;
4347
4348 if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
4349 pci_dev_put(pdev);
4350 return;
4351 }
4352
4353 pci_dev_put(pdev);
4354
4355 /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
4356 if (vtisochctrl & 1)
4357 return;
4358
4359 /* Drop all bits other than the number of TLB entries */
4360 vtisochctrl &= 0x1c;
4361
4362 /* If we have the recommended number of TLB entries (16), fine. */
4363 if (vtisochctrl == 0x10)
4364 return;
4365
4366 /* Zero TLB entries? You get to ride the short bus to school. */
4367 if (!vtisochctrl) {
4368 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
4369 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
4370 dmi_get_system_info(DMI_BIOS_VENDOR),
4371 dmi_get_system_info(DMI_BIOS_VERSION),
4372 dmi_get_system_info(DMI_PRODUCT_VERSION));
4373 iommu_identity_mapping |= IDENTMAP_AZALIA;
4374 return;
4375 }
4376
4377 	printk(KERN_WARNING "DMAR: Recommended number of TLB entries for ISOCH unit is 16; your BIOS set %d\n",
4378 vtisochctrl);
4379 }