Merge git://git.infradead.org/intel-iommu
author Linus Torvalds <torvalds@linux-foundation.org>
Fri, 27 May 2016 20:49:24 +0000 (13:49 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Fri, 27 May 2016 20:49:24 +0000 (13:49 -0700)
Pull intel IOMMU updates from David Woodhouse:
 "This patchset improves the scalability of the Intel IOMMU code by
  resolving two spinlock bottlenecks and eliminating the linearity of
  the IOVA allocator, yielding up to ~5x performance improvement and
  approaching 'iommu=off' performance"

* git://git.infradead.org/intel-iommu:
  iommu/vt-d: Use per-cpu IOVA caching
  iommu/iova: introduce per-cpu caching to iova allocation
  iommu/vt-d: change intel-iommu to use IOVA frame numbers
  iommu/vt-d: avoid dev iotlb logic for domains with no dev iotlbs
  iommu/vt-d: only unmap mapped entries
  iommu/vt-d: correct flush_unmaps pfn usage
  iommu/vt-d: per-cpu deferred invalidation queues
  iommu/vt-d: refactoring of deferred flush entries
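
As context for the diff below, here is a minimal sketch of how a DMA-mapping path can use the two fast-path helpers this series adds to <linux/iova.h> (alloc_iova_fast() and free_iova_fast()). The function names in the sketch are illustrative only, and the PAGE_SHIFT conversion mirrors the IOVA_PFN() macro used in intel-iommu.c; the real consumers are intel_alloc_iova() and intel_unmap() in the patch itself.

    #include <linux/iova.h>

    static dma_addr_t example_map_range(struct iova_domain *iovad,
                                        unsigned long nrpages, u64 dma_mask)
    {
            /* dma_mask >> PAGE_SHIFT mirrors the IOVA_PFN() macro in intel-iommu.c */
            unsigned long iova_pfn = alloc_iova_fast(iovad, nrpages,
                                                     dma_mask >> PAGE_SHIFT);

            if (!iova_pfn)          /* per-CPU cache and rbtree allocator both failed */
                    return 0;

            return (dma_addr_t)iova_pfn << PAGE_SHIFT;
    }

    static void example_unmap_range(struct iova_domain *iovad,
                                    unsigned long iova_pfn, unsigned long nrpages)
    {
            /* Returns the range to the per-CPU magazine cache when possible,
             * falling back to free_iova() on the shared rbtree otherwise. */
            free_iova_fast(iovad, iova_pfn, nrpages);
    }

On the allocation side the helper first consults the per-CPU magazine cache and only falls back to the rbtree allocator (flushing every CPU's cache once) when that fails, which is what takes the global iova_rbtree_lock off the hot path.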

drivers/iommu/intel-iommu.c
drivers/iommu/iova.c
include/linux/iova.h

drivers/iommu/intel-iommu.c
index b2bfb9594508feea4169a9c7704d25c95acf1490..a644d0cec2d8275d202fd3d871673b7787dd258c 100644
@@ -33,6 +33,7 @@
 #include <linux/dma-mapping.h>
 #include <linux/mempool.h>
 #include <linux/memory.h>
+#include <linux/cpu.h>
 #include <linux/timer.h>
 #include <linux/io.h>
 #include <linux/iova.h>
@@ -390,6 +391,7 @@ struct dmar_domain {
                                         * domain ids are 16 bit wide according
                                         * to VT-d spec, section 9.3 */
 
+       bool has_iotlb_device;
        struct list_head devices;       /* all devices' list */
        struct iova_domain iovad;       /* iova's that belong to this domain */
 
@@ -456,27 +458,32 @@ static LIST_HEAD(dmar_rmrr_units);
 
 static void flush_unmaps_timeout(unsigned long data);
 
-static DEFINE_TIMER(unmap_timer,  flush_unmaps_timeout, 0, 0);
+struct deferred_flush_entry {
+       unsigned long iova_pfn;
+       unsigned long nrpages;
+       struct dmar_domain *domain;
+       struct page *freelist;
+};
 
 #define HIGH_WATER_MARK 250
-struct deferred_flush_tables {
+struct deferred_flush_table {
        int next;
-       struct iova *iova[HIGH_WATER_MARK];
-       struct dmar_domain *domain[HIGH_WATER_MARK];
-       struct page *freelist[HIGH_WATER_MARK];
+       struct deferred_flush_entry entries[HIGH_WATER_MARK];
+};
+
+struct deferred_flush_data {
+       spinlock_t lock;
+       int timer_on;
+       struct timer_list timer;
+       long size;
+       struct deferred_flush_table *tables;
 };
 
-static struct deferred_flush_tables *deferred_flush;
+DEFINE_PER_CPU(struct deferred_flush_data, deferred_flush);
 
 /* bitmap for indexing intel_iommus */
 static int g_num_of_iommus;
 
-static DEFINE_SPINLOCK(async_umap_flush_lock);
-static LIST_HEAD(unmaps_to_do);
-
-static int timer_on;
-static long list_size;
-
 static void domain_exit(struct dmar_domain *domain);
 static void domain_remove_dev_info(struct dmar_domain *domain);
 static void dmar_remove_one_dev_info(struct dmar_domain *domain,
@@ -1458,10 +1465,35 @@ iommu_support_dev_iotlb (struct dmar_domain *domain, struct intel_iommu *iommu,
        return NULL;
 }
 
+static void domain_update_iotlb(struct dmar_domain *domain)
+{
+       struct device_domain_info *info;
+       bool has_iotlb_device = false;
+
+       assert_spin_locked(&device_domain_lock);
+
+       list_for_each_entry(info, &domain->devices, link) {
+               struct pci_dev *pdev;
+
+               if (!info->dev || !dev_is_pci(info->dev))
+                       continue;
+
+               pdev = to_pci_dev(info->dev);
+               if (pdev->ats_enabled) {
+                       has_iotlb_device = true;
+                       break;
+               }
+       }
+
+       domain->has_iotlb_device = has_iotlb_device;
+}
+
 static void iommu_enable_dev_iotlb(struct device_domain_info *info)
 {
        struct pci_dev *pdev;
 
+       assert_spin_locked(&device_domain_lock);
+
        if (!info || !dev_is_pci(info->dev))
                return;
 
@@ -1481,6 +1513,7 @@ static void iommu_enable_dev_iotlb(struct device_domain_info *info)
 #endif
        if (info->ats_supported && !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
                info->ats_enabled = 1;
+               domain_update_iotlb(info->domain);
                info->ats_qdep = pci_ats_queue_depth(pdev);
        }
 }
@@ -1489,6 +1522,8 @@ static void iommu_disable_dev_iotlb(struct device_domain_info *info)
 {
        struct pci_dev *pdev;
 
+       assert_spin_locked(&device_domain_lock);
+
        if (!dev_is_pci(info->dev))
                return;
 
@@ -1497,6 +1532,7 @@ static void iommu_disable_dev_iotlb(struct device_domain_info *info)
        if (info->ats_enabled) {
                pci_disable_ats(pdev);
                info->ats_enabled = 0;
+               domain_update_iotlb(info->domain);
        }
 #ifdef CONFIG_INTEL_IOMMU_SVM
        if (info->pri_enabled) {
@@ -1517,6 +1553,9 @@ static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
        unsigned long flags;
        struct device_domain_info *info;
 
+       if (!domain->has_iotlb_device)
+               return;
+
        spin_lock_irqsave(&device_domain_lock, flags);
        list_for_each_entry(info, &domain->devices, link) {
                if (!info->ats_enabled)
@@ -1734,6 +1773,7 @@ static struct dmar_domain *alloc_domain(int flags)
        memset(domain, 0, sizeof(*domain));
        domain->nid = -1;
        domain->flags = flags;
+       domain->has_iotlb_device = false;
        INIT_LIST_HEAD(&domain->devices);
 
        return domain;
@@ -1918,8 +1958,12 @@ static void domain_exit(struct dmar_domain *domain)
                return;
 
        /* Flush any lazy unmaps that may reference this domain */
-       if (!intel_iommu_strict)
-               flush_unmaps_timeout(0);
+       if (!intel_iommu_strict) {
+               int cpu;
+
+               for_each_possible_cpu(cpu)
+                       flush_unmaps_timeout(cpu);
+       }
 
        /* Remove associated devices and clear attached or cached domains */
        rcu_read_lock();
@@ -3077,7 +3121,7 @@ static int __init init_dmars(void)
        bool copied_tables = false;
        struct device *dev;
        struct intel_iommu *iommu;
-       int i, ret;
+       int i, ret, cpu;
 
        /*
         * for each drhd
@@ -3110,11 +3154,20 @@ static int __init init_dmars(void)
                goto error;
        }
 
-       deferred_flush = kzalloc(g_num_of_iommus *
-               sizeof(struct deferred_flush_tables), GFP_KERNEL);
-       if (!deferred_flush) {
-               ret = -ENOMEM;
-               goto free_g_iommus;
+       for_each_possible_cpu(cpu) {
+               struct deferred_flush_data *dfd = per_cpu_ptr(&deferred_flush,
+                                                             cpu);
+
+               dfd->tables = kzalloc(g_num_of_iommus *
+                                     sizeof(struct deferred_flush_table),
+                                     GFP_KERNEL);
+               if (!dfd->tables) {
+                       ret = -ENOMEM;
+                       goto free_g_iommus;
+               }
+
+               spin_lock_init(&dfd->lock);
+               setup_timer(&dfd->timer, flush_unmaps_timeout, cpu);
        }
 
        for_each_active_iommu(iommu, drhd) {
@@ -3291,19 +3344,20 @@ free_iommu:
                disable_dmar_iommu(iommu);
                free_dmar_iommu(iommu);
        }
-       kfree(deferred_flush);
 free_g_iommus:
+       for_each_possible_cpu(cpu)
+               kfree(per_cpu_ptr(&deferred_flush, cpu)->tables);
        kfree(g_iommus);
 error:
        return ret;
 }
 
 /* This takes a number of _MM_ pages, not VTD pages */
-static struct iova *intel_alloc_iova(struct device *dev,
+static unsigned long intel_alloc_iova(struct device *dev,
                                     struct dmar_domain *domain,
                                     unsigned long nrpages, uint64_t dma_mask)
 {
-       struct iova *iova = NULL;
+       unsigned long iova_pfn = 0;
 
        /* Restrict dma_mask to the width that the iommu can handle */
        dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw), dma_mask);
@@ -3316,19 +3370,19 @@ static struct iova *intel_alloc_iova(struct device *dev,
                 * DMA_BIT_MASK(32) and if that fails then try allocating
                 * from higher range
                 */
-               iova = alloc_iova(&domain->iovad, nrpages,
-                                 IOVA_PFN(DMA_BIT_MASK(32)), 1);
-               if (iova)
-                       return iova;
+               iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
+                                          IOVA_PFN(DMA_BIT_MASK(32)));
+               if (iova_pfn)
+                       return iova_pfn;
        }
-       iova = alloc_iova(&domain->iovad, nrpages, IOVA_PFN(dma_mask), 1);
-       if (unlikely(!iova)) {
+       iova_pfn = alloc_iova_fast(&domain->iovad, nrpages, IOVA_PFN(dma_mask));
+       if (unlikely(!iova_pfn)) {
                pr_err("Allocating %ld-page iova for %s failed",
                       nrpages, dev_name(dev));
-               return NULL;
+               return 0;
        }
 
-       return iova;
+       return iova_pfn;
 }
 
 static struct dmar_domain *__get_valid_domain_for_dev(struct device *dev)
@@ -3426,7 +3480,7 @@ static dma_addr_t __intel_map_single(struct device *dev, phys_addr_t paddr,
 {
        struct dmar_domain *domain;
        phys_addr_t start_paddr;
-       struct iova *iova;
+       unsigned long iova_pfn;
        int prot = 0;
        int ret;
        struct intel_iommu *iommu;
@@ -3444,8 +3498,8 @@ static dma_addr_t __intel_map_single(struct device *dev, phys_addr_t paddr,
        iommu = domain_get_iommu(domain);
        size = aligned_nrpages(paddr, size);
 
-       iova = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size), dma_mask);
-       if (!iova)
+       iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size), dma_mask);
+       if (!iova_pfn)
                goto error;
 
        /*
@@ -3463,7 +3517,7 @@ static dma_addr_t __intel_map_single(struct device *dev, phys_addr_t paddr,
         * might have two guest_addr mapping to the same host paddr, but this
         * is not a big problem
         */
-       ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova->pfn_lo),
+       ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova_pfn),
                                 mm_to_dma_pfn(paddr_pfn), size, prot);
        if (ret)
                goto error;
@@ -3471,18 +3525,18 @@ static dma_addr_t __intel_map_single(struct device *dev, phys_addr_t paddr,
        /* it's a non-present to present mapping. Only flush if caching mode */
        if (cap_caching_mode(iommu->cap))
                iommu_flush_iotlb_psi(iommu, domain,
-                                     mm_to_dma_pfn(iova->pfn_lo),
+                                     mm_to_dma_pfn(iova_pfn),
                                      size, 0, 1);
        else
                iommu_flush_write_buffer(iommu);
 
-       start_paddr = (phys_addr_t)iova->pfn_lo << PAGE_SHIFT;
+       start_paddr = (phys_addr_t)iova_pfn << PAGE_SHIFT;
        start_paddr += paddr & ~PAGE_MASK;
        return start_paddr;
 
 error:
-       if (iova)
-               __free_iova(&domain->iovad, iova);
+       if (iova_pfn)
+               free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
        pr_err("Device %s request: %zx@%llx dir %d --- failed\n",
                dev_name(dev), size, (unsigned long long)paddr, dir);
        return 0;
@@ -3497,91 +3551,120 @@ static dma_addr_t intel_map_page(struct device *dev, struct page *page,
                                  dir, *dev->dma_mask);
 }
 
-static void flush_unmaps(void)
+static void flush_unmaps(struct deferred_flush_data *flush_data)
 {
        int i, j;
 
-       timer_on = 0;
+       flush_data->timer_on = 0;
 
        /* just flush them all */
        for (i = 0; i < g_num_of_iommus; i++) {
                struct intel_iommu *iommu = g_iommus[i];
+               struct deferred_flush_table *flush_table =
+                               &flush_data->tables[i];
                if (!iommu)
                        continue;
 
-               if (!deferred_flush[i].next)
+               if (!flush_table->next)
                        continue;
 
                /* In caching mode, global flushes turn emulation expensive */
                if (!cap_caching_mode(iommu->cap))
                        iommu->flush.flush_iotlb(iommu, 0, 0, 0,
                                         DMA_TLB_GLOBAL_FLUSH);
-               for (j = 0; j < deferred_flush[i].next; j++) {
+               for (j = 0; j < flush_table->next; j++) {
                        unsigned long mask;
-                       struct iova *iova = deferred_flush[i].iova[j];
-                       struct dmar_domain *domain = deferred_flush[i].domain[j];
+                       struct deferred_flush_entry *entry =
+                                               &flush_table->entries[j];
+                       unsigned long iova_pfn = entry->iova_pfn;
+                       unsigned long nrpages = entry->nrpages;
+                       struct dmar_domain *domain = entry->domain;
+                       struct page *freelist = entry->freelist;
 
                        /* On real hardware multiple invalidations are expensive */
                        if (cap_caching_mode(iommu->cap))
                                iommu_flush_iotlb_psi(iommu, domain,
-                                       iova->pfn_lo, iova_size(iova),
-                                       !deferred_flush[i].freelist[j], 0);
+                                       mm_to_dma_pfn(iova_pfn),
+                                       nrpages, !freelist, 0);
                        else {
-                               mask = ilog2(mm_to_dma_pfn(iova_size(iova)));
-                               iommu_flush_dev_iotlb(deferred_flush[i].domain[j],
-                                               (uint64_t)iova->pfn_lo << PAGE_SHIFT, mask);
+                               mask = ilog2(nrpages);
+                               iommu_flush_dev_iotlb(domain,
+                                               (uint64_t)iova_pfn << PAGE_SHIFT, mask);
                        }
-                       __free_iova(&deferred_flush[i].domain[j]->iovad, iova);
-                       if (deferred_flush[i].freelist[j])
-                               dma_free_pagelist(deferred_flush[i].freelist[j]);
+                       free_iova_fast(&domain->iovad, iova_pfn, nrpages);
+                       if (freelist)
+                               dma_free_pagelist(freelist);
                }
-               deferred_flush[i].next = 0;
+               flush_table->next = 0;
        }
 
-       list_size = 0;
+       flush_data->size = 0;
 }
 
-static void flush_unmaps_timeout(unsigned long data)
+static void flush_unmaps_timeout(unsigned long cpuid)
 {
+       struct deferred_flush_data *flush_data = per_cpu_ptr(&deferred_flush, cpuid);
        unsigned long flags;
 
-       spin_lock_irqsave(&async_umap_flush_lock, flags);
-       flush_unmaps();
-       spin_unlock_irqrestore(&async_umap_flush_lock, flags);
+       spin_lock_irqsave(&flush_data->lock, flags);
+       flush_unmaps(flush_data);
+       spin_unlock_irqrestore(&flush_data->lock, flags);
 }
 
-static void add_unmap(struct dmar_domain *dom, struct iova *iova, struct page *freelist)
+static void add_unmap(struct dmar_domain *dom, unsigned long iova_pfn,
+                     unsigned long nrpages, struct page *freelist)
 {
        unsigned long flags;
-       int next, iommu_id;
+       int entry_id, iommu_id;
        struct intel_iommu *iommu;
+       struct deferred_flush_entry *entry;
+       struct deferred_flush_data *flush_data;
+       unsigned int cpuid;
 
-       spin_lock_irqsave(&async_umap_flush_lock, flags);
-       if (list_size == HIGH_WATER_MARK)
-               flush_unmaps();
+       cpuid = get_cpu();
+       flush_data = per_cpu_ptr(&deferred_flush, cpuid);
+
+       /* Flush all CPUs' entries to avoid deferring too much.  If
+        * this becomes a bottleneck, can just flush us, and rely on
+        * flush timer for the rest.
+        */
+       if (flush_data->size == HIGH_WATER_MARK) {
+               int cpu;
+
+               for_each_online_cpu(cpu)
+                       flush_unmaps_timeout(cpu);
+       }
+
+       spin_lock_irqsave(&flush_data->lock, flags);
 
        iommu = domain_get_iommu(dom);
        iommu_id = iommu->seq_id;
 
-       next = deferred_flush[iommu_id].next;
-       deferred_flush[iommu_id].domain[next] = dom;
-       deferred_flush[iommu_id].iova[next] = iova;
-       deferred_flush[iommu_id].freelist[next] = freelist;
-       deferred_flush[iommu_id].next++;
+       entry_id = flush_data->tables[iommu_id].next;
+       ++(flush_data->tables[iommu_id].next);
 
-       if (!timer_on) {
-               mod_timer(&unmap_timer, jiffies + msecs_to_jiffies(10));
-               timer_on = 1;
+       entry = &flush_data->tables[iommu_id].entries[entry_id];
+       entry->domain = dom;
+       entry->iova_pfn = iova_pfn;
+       entry->nrpages = nrpages;
+       entry->freelist = freelist;
+
+       if (!flush_data->timer_on) {
+               mod_timer(&flush_data->timer, jiffies + msecs_to_jiffies(10));
+               flush_data->timer_on = 1;
        }
-       list_size++;
-       spin_unlock_irqrestore(&async_umap_flush_lock, flags);
+       flush_data->size++;
+       spin_unlock_irqrestore(&flush_data->lock, flags);
+
+       put_cpu();
 }
 
-static void intel_unmap(struct device *dev, dma_addr_t dev_addr)
+static void intel_unmap(struct device *dev, dma_addr_t dev_addr, size_t size)
 {
        struct dmar_domain *domain;
        unsigned long start_pfn, last_pfn;
-       struct iova *iova;
+       unsigned long nrpages;
+       unsigned long iova_pfn;
        struct intel_iommu *iommu;
        struct page *freelist;
 
@@ -3593,13 +3676,11 @@ static void intel_unmap(struct device *dev, dma_addr_t dev_addr)
 
        iommu = domain_get_iommu(domain);
 
-       iova = find_iova(&domain->iovad, IOVA_PFN(dev_addr));
-       if (WARN_ONCE(!iova, "Driver unmaps unmatched page at PFN %llx\n",
-                     (unsigned long long)dev_addr))
-               return;
+       iova_pfn = IOVA_PFN(dev_addr);
 
-       start_pfn = mm_to_dma_pfn(iova->pfn_lo);
-       last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1;
+       nrpages = aligned_nrpages(dev_addr, size);
+       start_pfn = mm_to_dma_pfn(iova_pfn);
+       last_pfn = start_pfn + nrpages - 1;
 
        pr_debug("Device %s unmapping: pfn %lx-%lx\n",
                 dev_name(dev), start_pfn, last_pfn);
@@ -3608,12 +3689,12 @@ static void intel_unmap(struct device *dev, dma_addr_t dev_addr)
 
        if (intel_iommu_strict) {
                iommu_flush_iotlb_psi(iommu, domain, start_pfn,
-                                     last_pfn - start_pfn + 1, !freelist, 0);
+                                     nrpages, !freelist, 0);
                /* free iova */
-               __free_iova(&domain->iovad, iova);
+               free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(nrpages));
                dma_free_pagelist(freelist);
        } else {
-               add_unmap(domain, iova, freelist);
+               add_unmap(domain, iova_pfn, nrpages, freelist);
                /*
                 * queue up the release of the unmap to save the 1/6th of the
                 * cpu used up by the iotlb flush operation...
@@ -3625,7 +3706,7 @@ static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
                             size_t size, enum dma_data_direction dir,
                             struct dma_attrs *attrs)
 {
-       intel_unmap(dev, dev_addr);
+       intel_unmap(dev, dev_addr, size);
 }
 
 static void *intel_alloc_coherent(struct device *dev, size_t size,
@@ -3684,7 +3765,7 @@ static void intel_free_coherent(struct device *dev, size_t size, void *vaddr,
        size = PAGE_ALIGN(size);
        order = get_order(size);
 
-       intel_unmap(dev, dma_handle);
+       intel_unmap(dev, dma_handle, size);
        if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
                __free_pages(page, order);
 }
@@ -3693,7 +3774,16 @@ static void intel_unmap_sg(struct device *dev, struct scatterlist *sglist,
                           int nelems, enum dma_data_direction dir,
                           struct dma_attrs *attrs)
 {
-       intel_unmap(dev, sglist[0].dma_address);
+       dma_addr_t startaddr = sg_dma_address(sglist) & PAGE_MASK;
+       unsigned long nrpages = 0;
+       struct scatterlist *sg;
+       int i;
+
+       for_each_sg(sglist, sg, nelems, i) {
+               nrpages += aligned_nrpages(sg_dma_address(sg), sg_dma_len(sg));
+       }
+
+       intel_unmap(dev, startaddr, nrpages << VTD_PAGE_SHIFT);
 }
 
 static int intel_nontranslate_map_sg(struct device *hddev,
@@ -3717,7 +3807,7 @@ static int intel_map_sg(struct device *dev, struct scatterlist *sglist, int nele
        struct dmar_domain *domain;
        size_t size = 0;
        int prot = 0;
-       struct iova *iova = NULL;
+       unsigned long iova_pfn;
        int ret;
        struct scatterlist *sg;
        unsigned long start_vpfn;
@@ -3736,9 +3826,9 @@ static int intel_map_sg(struct device *dev, struct scatterlist *sglist, int nele
        for_each_sg(sglist, sg, nelems, i)
                size += aligned_nrpages(sg->offset, sg->length);
 
-       iova = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size),
+       iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size),
                                *dev->dma_mask);
-       if (!iova) {
+       if (!iova_pfn) {
                sglist->dma_length = 0;
                return 0;
        }
@@ -3753,13 +3843,13 @@ static int intel_map_sg(struct device *dev, struct scatterlist *sglist, int nele
        if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
                prot |= DMA_PTE_WRITE;
 
-       start_vpfn = mm_to_dma_pfn(iova->pfn_lo);
+       start_vpfn = mm_to_dma_pfn(iova_pfn);
 
        ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
        if (unlikely(ret)) {
                dma_pte_free_pagetable(domain, start_vpfn,
                                       start_vpfn + size - 1);
-               __free_iova(&domain->iovad, iova);
+               free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
                return 0;
        }
 
@@ -4498,6 +4588,46 @@ static struct notifier_block intel_iommu_memory_nb = {
        .priority = 0
 };
 
+static void free_all_cpu_cached_iovas(unsigned int cpu)
+{
+       int i;
+
+       for (i = 0; i < g_num_of_iommus; i++) {
+               struct intel_iommu *iommu = g_iommus[i];
+               struct dmar_domain *domain;
+               u16 did;
+
+               if (!iommu)
+                       continue;
+
+               for (did = 0; did < 0xffff; did++) {
+                       domain = get_iommu_domain(iommu, did);
+
+                       if (!domain)
+                               continue;
+                       free_cpu_cached_iovas(cpu, &domain->iovad);
+               }
+       }
+}
+
+static int intel_iommu_cpu_notifier(struct notifier_block *nfb,
+                                   unsigned long action, void *v)
+{
+       unsigned int cpu = (unsigned long)v;
+
+       switch (action) {
+       case CPU_DEAD:
+       case CPU_DEAD_FROZEN:
+               free_all_cpu_cached_iovas(cpu);
+               flush_unmaps_timeout(cpu);
+               break;
+       }
+       return NOTIFY_OK;
+}
+
+static struct notifier_block intel_iommu_cpu_nb = {
+       .notifier_call = intel_iommu_cpu_notifier,
+};
 
 static ssize_t intel_iommu_show_version(struct device *dev,
                                        struct device_attribute *attr,
@@ -4631,7 +4761,6 @@ int __init intel_iommu_init(void)
        up_write(&dmar_global_lock);
        pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
 
-       init_timer(&unmap_timer);
 #ifdef CONFIG_SWIOTLB
        swiotlb = 0;
 #endif
@@ -4648,6 +4777,7 @@ int __init intel_iommu_init(void)
        bus_register_notifier(&pci_bus_type, &device_nb);
        if (si_domain && !hw_pass_through)
                register_memory_notifier(&intel_iommu_memory_nb);
+       register_hotcpu_notifier(&intel_iommu_cpu_nb);
 
        intel_iommu_enabled = 1;
 
drivers/iommu/iova.c
index fa0adef32bd6d3a4af1b97ee3b1fb22dde6c225f..ba764a0835d3cd881981f0e049956e6f71a62788 100644
 #include <linux/iova.h>
 #include <linux/module.h>
 #include <linux/slab.h>
+#include <linux/smp.h>
+#include <linux/bitops.h>
+
+static bool iova_rcache_insert(struct iova_domain *iovad,
+                              unsigned long pfn,
+                              unsigned long size);
+static unsigned long iova_rcache_get(struct iova_domain *iovad,
+                                    unsigned long size,
+                                    unsigned long limit_pfn);
+static void init_iova_rcaches(struct iova_domain *iovad);
+static void free_iova_rcaches(struct iova_domain *iovad);
 
 void
 init_iova_domain(struct iova_domain *iovad, unsigned long granule,
@@ -38,6 +49,7 @@ init_iova_domain(struct iova_domain *iovad, unsigned long granule,
        iovad->granule = granule;
        iovad->start_pfn = start_pfn;
        iovad->dma_32bit_pfn = pfn_32bit;
+       init_iova_rcaches(iovad);
 }
 EXPORT_SYMBOL_GPL(init_iova_domain);
 
@@ -291,33 +303,18 @@ alloc_iova(struct iova_domain *iovad, unsigned long size,
 }
 EXPORT_SYMBOL_GPL(alloc_iova);
 
-/**
- * find_iova - find's an iova for a given pfn
- * @iovad: - iova domain in question.
- * @pfn: - page frame number
- * This function finds and returns an iova belonging to the
- * given doamin which matches the given pfn.
- */
-struct iova *find_iova(struct iova_domain *iovad, unsigned long pfn)
+static struct iova *
+private_find_iova(struct iova_domain *iovad, unsigned long pfn)
 {
-       unsigned long flags;
-       struct rb_node *node;
+       struct rb_node *node = iovad->rbroot.rb_node;
+
+       assert_spin_locked(&iovad->iova_rbtree_lock);
 
-       /* Take the lock so that no other thread is manipulating the rbtree */
-       spin_lock_irqsave(&iovad->iova_rbtree_lock, flags);
-       node = iovad->rbroot.rb_node;
        while (node) {
                struct iova *iova = container_of(node, struct iova, node);
 
                /* If pfn falls within iova's range, return iova */
                if ((pfn >= iova->pfn_lo) && (pfn <= iova->pfn_hi)) {
-                       spin_unlock_irqrestore(&iovad->iova_rbtree_lock, flags);
-                       /* We are not holding the lock while this iova
-                        * is referenced by the caller as the same thread
-                        * which called this function also calls __free_iova()
-                        * and it is by design that only one thread can possibly
-                        * reference a particular iova and hence no conflict.
-                        */
                        return iova;
                }
 
@@ -327,9 +324,35 @@ struct iova *find_iova(struct iova_domain *iovad, unsigned long pfn)
                        node = node->rb_right;
        }
 
-       spin_unlock_irqrestore(&iovad->iova_rbtree_lock, flags);
        return NULL;
 }
+
+static void private_free_iova(struct iova_domain *iovad, struct iova *iova)
+{
+       assert_spin_locked(&iovad->iova_rbtree_lock);
+       __cached_rbnode_delete_update(iovad, iova);
+       rb_erase(&iova->node, &iovad->rbroot);
+       free_iova_mem(iova);
+}
+
+/**
+ * find_iova - finds an iova for a given pfn
+ * @iovad: - iova domain in question.
+ * @pfn: - page frame number
+ * This function finds and returns an iova belonging to the
+ * given domain which matches the given pfn.
+ */
+struct iova *find_iova(struct iova_domain *iovad, unsigned long pfn)
+{
+       unsigned long flags;
+       struct iova *iova;
+
+       /* Take the lock so that no other thread is manipulating the rbtree */
+       spin_lock_irqsave(&iovad->iova_rbtree_lock, flags);
+       iova = private_find_iova(iovad, pfn);
+       spin_unlock_irqrestore(&iovad->iova_rbtree_lock, flags);
+       return iova;
+}
 EXPORT_SYMBOL_GPL(find_iova);
 
 /**
@@ -344,10 +367,8 @@ __free_iova(struct iova_domain *iovad, struct iova *iova)
        unsigned long flags;
 
        spin_lock_irqsave(&iovad->iova_rbtree_lock, flags);
-       __cached_rbnode_delete_update(iovad, iova);
-       rb_erase(&iova->node, &iovad->rbroot);
+       private_free_iova(iovad, iova);
        spin_unlock_irqrestore(&iovad->iova_rbtree_lock, flags);
-       free_iova_mem(iova);
 }
 EXPORT_SYMBOL_GPL(__free_iova);
 
@@ -369,6 +390,63 @@ free_iova(struct iova_domain *iovad, unsigned long pfn)
 }
 EXPORT_SYMBOL_GPL(free_iova);
 
+/**
+ * alloc_iova_fast - allocates an iova from rcache
+ * @iovad: - iova domain in question
+ * @size: - size of page frames to allocate
+ * @limit_pfn: - max limit address
+ * This function tries to satisfy an iova allocation from the rcache,
+ * and falls back to regular allocation on failure.
+*/
+unsigned long
+alloc_iova_fast(struct iova_domain *iovad, unsigned long size,
+               unsigned long limit_pfn)
+{
+       bool flushed_rcache = false;
+       unsigned long iova_pfn;
+       struct iova *new_iova;
+
+       iova_pfn = iova_rcache_get(iovad, size, limit_pfn);
+       if (iova_pfn)
+               return iova_pfn;
+
+retry:
+       new_iova = alloc_iova(iovad, size, limit_pfn, true);
+       if (!new_iova) {
+               unsigned int cpu;
+
+               if (flushed_rcache)
+                       return 0;
+
+               /* Try replenishing IOVAs by flushing rcache. */
+               flushed_rcache = true;
+               for_each_online_cpu(cpu)
+                       free_cpu_cached_iovas(cpu, iovad);
+               goto retry;
+       }
+
+       return new_iova->pfn_lo;
+}
+EXPORT_SYMBOL_GPL(alloc_iova_fast);
+
+/**
+ * free_iova_fast - free iova pfn range into rcache
+ * @iovad: - iova domain in question.
+ * @pfn: - pfn that is allocated previously
+ * @size: - # of pages in range
+ * This function frees an iova range by trying to put it into the rcache,
+ * falling back to regular iova deallocation via free_iova() if this fails.
+ */
+void
+free_iova_fast(struct iova_domain *iovad, unsigned long pfn, unsigned long size)
+{
+       if (iova_rcache_insert(iovad, pfn, size))
+               return;
+
+       free_iova(iovad, pfn);
+}
+EXPORT_SYMBOL_GPL(free_iova_fast);
+
 /**
 * put_iova_domain - destroys the iova domain
  * @iovad: - iova domain in question.
@@ -379,6 +457,7 @@ void put_iova_domain(struct iova_domain *iovad)
        struct rb_node *node;
        unsigned long flags;
 
+       free_iova_rcaches(iovad);
        spin_lock_irqsave(&iovad->iova_rbtree_lock, flags);
        node = rb_first(&iovad->rbroot);
        while (node) {
@@ -550,5 +629,295 @@ error:
        return NULL;
 }
 
+/*
+ * Magazine caches for IOVA ranges.  For an introduction to magazines,
+ * see the USENIX 2001 paper "Magazines and Vmem: Extending the Slab
+ * Allocator to Many CPUs and Arbitrary Resources" by Bonwick and Adams.
+ * For simplicity, we use a static magazine size and don't implement the
+ * dynamic size tuning described in the paper.
+ */
+
+#define IOVA_MAG_SIZE 128
+
+struct iova_magazine {
+       unsigned long size;
+       unsigned long pfns[IOVA_MAG_SIZE];
+};
+
+struct iova_cpu_rcache {
+       spinlock_t lock;
+       struct iova_magazine *loaded;
+       struct iova_magazine *prev;
+};
+
+static struct iova_magazine *iova_magazine_alloc(gfp_t flags)
+{
+       return kzalloc(sizeof(struct iova_magazine), flags);
+}
+
+static void iova_magazine_free(struct iova_magazine *mag)
+{
+       kfree(mag);
+}
+
+static void
+iova_magazine_free_pfns(struct iova_magazine *mag, struct iova_domain *iovad)
+{
+       unsigned long flags;
+       int i;
+
+       if (!mag)
+               return;
+
+       spin_lock_irqsave(&iovad->iova_rbtree_lock, flags);
+
+       for (i = 0 ; i < mag->size; ++i) {
+               struct iova *iova = private_find_iova(iovad, mag->pfns[i]);
+
+               BUG_ON(!iova);
+               private_free_iova(iovad, iova);
+       }
+
+       spin_unlock_irqrestore(&iovad->iova_rbtree_lock, flags);
+
+       mag->size = 0;
+}
+
+static bool iova_magazine_full(struct iova_magazine *mag)
+{
+       return (mag && mag->size == IOVA_MAG_SIZE);
+}
+
+static bool iova_magazine_empty(struct iova_magazine *mag)
+{
+       return (!mag || mag->size == 0);
+}
+
+static unsigned long iova_magazine_pop(struct iova_magazine *mag,
+                                      unsigned long limit_pfn)
+{
+       BUG_ON(iova_magazine_empty(mag));
+
+       if (mag->pfns[mag->size - 1] >= limit_pfn)
+               return 0;
+
+       return mag->pfns[--mag->size];
+}
+
+static void iova_magazine_push(struct iova_magazine *mag, unsigned long pfn)
+{
+       BUG_ON(iova_magazine_full(mag));
+
+       mag->pfns[mag->size++] = pfn;
+}
+
+static void init_iova_rcaches(struct iova_domain *iovad)
+{
+       struct iova_cpu_rcache *cpu_rcache;
+       struct iova_rcache *rcache;
+       unsigned int cpu;
+       int i;
+
+       for (i = 0; i < IOVA_RANGE_CACHE_MAX_SIZE; ++i) {
+               rcache = &iovad->rcaches[i];
+               spin_lock_init(&rcache->lock);
+               rcache->depot_size = 0;
+               rcache->cpu_rcaches = __alloc_percpu(sizeof(*cpu_rcache), cache_line_size());
+               if (WARN_ON(!rcache->cpu_rcaches))
+                       continue;
+               for_each_possible_cpu(cpu) {
+                       cpu_rcache = per_cpu_ptr(rcache->cpu_rcaches, cpu);
+                       spin_lock_init(&cpu_rcache->lock);
+                       cpu_rcache->loaded = iova_magazine_alloc(GFP_KERNEL);
+                       cpu_rcache->prev = iova_magazine_alloc(GFP_KERNEL);
+               }
+       }
+}
+
+/*
+ * Try inserting IOVA range starting with 'iova_pfn' into 'rcache', and
+ * return true on success.  Can fail if rcache is full and we can't free
+ * space, and free_iova() (our only caller) will then return the IOVA
+ * range to the rbtree instead.
+ */
+static bool __iova_rcache_insert(struct iova_domain *iovad,
+                                struct iova_rcache *rcache,
+                                unsigned long iova_pfn)
+{
+       struct iova_magazine *mag_to_free = NULL;
+       struct iova_cpu_rcache *cpu_rcache;
+       bool can_insert = false;
+       unsigned long flags;
+
+       cpu_rcache = this_cpu_ptr(rcache->cpu_rcaches);
+       spin_lock_irqsave(&cpu_rcache->lock, flags);
+
+       if (!iova_magazine_full(cpu_rcache->loaded)) {
+               can_insert = true;
+       } else if (!iova_magazine_full(cpu_rcache->prev)) {
+               swap(cpu_rcache->prev, cpu_rcache->loaded);
+               can_insert = true;
+       } else {
+               struct iova_magazine *new_mag = iova_magazine_alloc(GFP_ATOMIC);
+
+               if (new_mag) {
+                       spin_lock(&rcache->lock);
+                       if (rcache->depot_size < MAX_GLOBAL_MAGS) {
+                               rcache->depot[rcache->depot_size++] =
+                                               cpu_rcache->loaded;
+                       } else {
+                               mag_to_free = cpu_rcache->loaded;
+                       }
+                       spin_unlock(&rcache->lock);
+
+                       cpu_rcache->loaded = new_mag;
+                       can_insert = true;
+               }
+       }
+
+       if (can_insert)
+               iova_magazine_push(cpu_rcache->loaded, iova_pfn);
+
+       spin_unlock_irqrestore(&cpu_rcache->lock, flags);
+
+       if (mag_to_free) {
+               iova_magazine_free_pfns(mag_to_free, iovad);
+               iova_magazine_free(mag_to_free);
+       }
+
+       return can_insert;
+}
+
+static bool iova_rcache_insert(struct iova_domain *iovad, unsigned long pfn,
+                              unsigned long size)
+{
+       unsigned int log_size = order_base_2(size);
+
+       if (log_size >= IOVA_RANGE_CACHE_MAX_SIZE)
+               return false;
+
+       return __iova_rcache_insert(iovad, &iovad->rcaches[log_size], pfn);
+}
+
+/*
+ * Caller wants to allocate a new IOVA range from 'rcache'.  If we can
+ * satisfy the request, return a matching non-NULL range and remove
+ * it from the 'rcache'.
+ */
+static unsigned long __iova_rcache_get(struct iova_rcache *rcache,
+                                      unsigned long limit_pfn)
+{
+       struct iova_cpu_rcache *cpu_rcache;
+       unsigned long iova_pfn = 0;
+       bool has_pfn = false;
+       unsigned long flags;
+
+       cpu_rcache = this_cpu_ptr(rcache->cpu_rcaches);
+       spin_lock_irqsave(&cpu_rcache->lock, flags);
+
+       if (!iova_magazine_empty(cpu_rcache->loaded)) {
+               has_pfn = true;
+       } else if (!iova_magazine_empty(cpu_rcache->prev)) {
+               swap(cpu_rcache->prev, cpu_rcache->loaded);
+               has_pfn = true;
+       } else {
+               spin_lock(&rcache->lock);
+               if (rcache->depot_size > 0) {
+                       iova_magazine_free(cpu_rcache->loaded);
+                       cpu_rcache->loaded = rcache->depot[--rcache->depot_size];
+                       has_pfn = true;
+               }
+               spin_unlock(&rcache->lock);
+       }
+
+       if (has_pfn)
+               iova_pfn = iova_magazine_pop(cpu_rcache->loaded, limit_pfn);
+
+       spin_unlock_irqrestore(&cpu_rcache->lock, flags);
+
+       return iova_pfn;
+}
+
+/*
+ * Try to satisfy IOVA allocation range from rcache.  Fail if requested
+ * size is too big or the DMA limit we are given isn't satisfied by the
+ * top element in the magazine.
+ */
+static unsigned long iova_rcache_get(struct iova_domain *iovad,
+                                    unsigned long size,
+                                    unsigned long limit_pfn)
+{
+       unsigned int log_size = order_base_2(size);
+
+       if (log_size >= IOVA_RANGE_CACHE_MAX_SIZE)
+               return 0;
+
+       return __iova_rcache_get(&iovad->rcaches[log_size], limit_pfn);
+}
+
+/*
+ * Free a cpu's rcache.
+ */
+static void free_cpu_iova_rcache(unsigned int cpu, struct iova_domain *iovad,
+                                struct iova_rcache *rcache)
+{
+       struct iova_cpu_rcache *cpu_rcache = per_cpu_ptr(rcache->cpu_rcaches, cpu);
+       unsigned long flags;
+
+       spin_lock_irqsave(&cpu_rcache->lock, flags);
+
+       iova_magazine_free_pfns(cpu_rcache->loaded, iovad);
+       iova_magazine_free(cpu_rcache->loaded);
+
+       iova_magazine_free_pfns(cpu_rcache->prev, iovad);
+       iova_magazine_free(cpu_rcache->prev);
+
+       spin_unlock_irqrestore(&cpu_rcache->lock, flags);
+}
+
+/*
+ * free rcache data structures.
+ */
+static void free_iova_rcaches(struct iova_domain *iovad)
+{
+       struct iova_rcache *rcache;
+       unsigned long flags;
+       unsigned int cpu;
+       int i, j;
+
+       for (i = 0; i < IOVA_RANGE_CACHE_MAX_SIZE; ++i) {
+               rcache = &iovad->rcaches[i];
+               for_each_possible_cpu(cpu)
+                       free_cpu_iova_rcache(cpu, iovad, rcache);
+               spin_lock_irqsave(&rcache->lock, flags);
+               free_percpu(rcache->cpu_rcaches);
+               for (j = 0; j < rcache->depot_size; ++j) {
+                       iova_magazine_free_pfns(rcache->depot[j], iovad);
+                       iova_magazine_free(rcache->depot[j]);
+               }
+               spin_unlock_irqrestore(&rcache->lock, flags);
+       }
+}
+
+/*
+ * free all the IOVA ranges cached by a cpu (used when cpu is unplugged)
+ */
+void free_cpu_cached_iovas(unsigned int cpu, struct iova_domain *iovad)
+{
+       struct iova_cpu_rcache *cpu_rcache;
+       struct iova_rcache *rcache;
+       unsigned long flags;
+       int i;
+
+       for (i = 0; i < IOVA_RANGE_CACHE_MAX_SIZE; ++i) {
+               rcache = &iovad->rcaches[i];
+               cpu_rcache = per_cpu_ptr(rcache->cpu_rcaches, cpu);
+               spin_lock_irqsave(&cpu_rcache->lock, flags);
+               iova_magazine_free_pfns(cpu_rcache->loaded, iovad);
+               iova_magazine_free_pfns(cpu_rcache->prev, iovad);
+               spin_unlock_irqrestore(&cpu_rcache->lock, flags);
+       }
+}
+
 MODULE_AUTHOR("Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>");
 MODULE_LICENSE("GPL");
include/linux/iova.h
index 92f7177db2ce869a29db8813911c3a8a0c2b86b2..f27bb2c62fca5cf19a74f3f03c64b8cb9036bd19 100644
 /* iova structure */
 struct iova {
        struct rb_node  node;
-       unsigned long   pfn_hi; /* IOMMU dish out addr hi */
-       unsigned long   pfn_lo; /* IOMMU dish out addr lo */
+       unsigned long   pfn_hi; /* Highest allocated pfn */
+       unsigned long   pfn_lo; /* Lowest allocated pfn */
+};
+
+struct iova_magazine;
+struct iova_cpu_rcache;
+
+#define IOVA_RANGE_CACHE_MAX_SIZE 6    /* log of max cached IOVA range size (in pages) */
+#define MAX_GLOBAL_MAGS 32     /* magazines per bin */
+
+struct iova_rcache {
+       spinlock_t lock;
+       unsigned long depot_size;
+       struct iova_magazine *depot[MAX_GLOBAL_MAGS];
+       struct iova_cpu_rcache __percpu *cpu_rcaches;
 };
 
 /* holds all the iova translations for a domain */
@@ -31,6 +44,7 @@ struct iova_domain {
        unsigned long   granule;        /* pfn granularity for this domain */
        unsigned long   start_pfn;      /* Lower limit for this domain */
        unsigned long   dma_32bit_pfn;
+       struct iova_rcache rcaches[IOVA_RANGE_CACHE_MAX_SIZE];  /* IOVA range caches */
 };
 
 static inline unsigned long iova_size(struct iova *iova)
@@ -78,6 +92,10 @@ void __free_iova(struct iova_domain *iovad, struct iova *iova);
 struct iova *alloc_iova(struct iova_domain *iovad, unsigned long size,
        unsigned long limit_pfn,
        bool size_aligned);
+void free_iova_fast(struct iova_domain *iovad, unsigned long pfn,
+                   unsigned long size);
+unsigned long alloc_iova_fast(struct iova_domain *iovad, unsigned long size,
+                             unsigned long limit_pfn);
 struct iova *reserve_iova(struct iova_domain *iovad, unsigned long pfn_lo,
        unsigned long pfn_hi);
 void copy_reserved_iova(struct iova_domain *from, struct iova_domain *to);
@@ -87,5 +105,6 @@ struct iova *find_iova(struct iova_domain *iovad, unsigned long pfn);
 void put_iova_domain(struct iova_domain *iovad);
 struct iova *split_and_remove_iova(struct iova_domain *iovad,
        struct iova *iova, unsigned long pfn_lo, unsigned long pfn_hi);
+void free_cpu_cached_iovas(unsigned int cpu, struct iova_domain *iovad);
 
 #endif