diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 913559388fda3640387a237670a334b7aa2074a5..fd3a07b3e6f4e086b2111c312a78512bed927f9b 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
 #include <linux/swap.h>
 #include <linux/shrinker.h>
 #include <linux/mm_inline.h>
+#include <linux/swapops.h>
 #include <linux/dax.h>
 #include <linux/kthread.h>
 #include <linux/khugepaged.h>
 #include <linux/freezer.h>
+#include <linux/pfn_t.h>
 #include <linux/mman.h>
+#include <linux/memremap.h>
 #include <linux/pagemap.h>
+#include <linux/debugfs.h>
 #include <linux/migrate.h>
 #include <linux/hashtable.h>
 #include <linux/userfaultfd_k.h>
@@ -134,6 +138,10 @@ static struct khugepaged_scan khugepaged_scan = {
        .mm_head = LIST_HEAD_INIT(khugepaged_scan.mm_head),
 };
 
+static DEFINE_SPINLOCK(split_queue_lock);
+static LIST_HEAD(split_queue);
+static unsigned long split_queue_len;
+static struct shrinker deferred_split_shrinker;
 
 static void set_recommended_min_free_kbytes(void)
 {
@@ -666,6 +674,9 @@ static int __init hugepage_init(void)
        err = register_shrinker(&huge_zero_page_shrinker);
        if (err)
                goto err_hzp_shrinker;
+       err = register_shrinker(&deferred_split_shrinker);
+       if (err)
+               goto err_split_shrinker;
 
        /*
         * By default disable transparent hugepages on smaller systems,
@@ -683,6 +694,8 @@ static int __init hugepage_init(void)
 
        return 0;
 err_khugepaged:
+       unregister_shrinker(&deferred_split_shrinker);
+err_split_shrinker:
        unregister_shrinker(&huge_zero_page_shrinker);
 err_hzp_shrinker:
        khugepaged_slab_exit();
@@ -739,6 +752,27 @@ static inline pmd_t mk_huge_pmd(struct page *page, pgprot_t prot)
        return entry;
 }
 
+static inline struct list_head *page_deferred_list(struct page *page)
+{
+       /*
+        * ->lru in the tail pages is occupied by compound_head.
+        * Let's use ->mapping + ->index in the second tail page as list_head.
+        */
+       return (struct list_head *)&page[2].mapping;
+}
+
+void prep_transhuge_page(struct page *page)
+{
+       /*
+        * we use page->mapping and page->index in the second tail page
+        * as list_head: assuming THP order >= 2
+        */
+       BUILD_BUG_ON(HPAGE_PMD_ORDER < 2);
+
+       INIT_LIST_HEAD(page_deferred_list(page));
+       set_compound_page_dtor(page, TRANSHUGE_PAGE_DTOR);
+}
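A quick illustration of the contract these two helpers establish: any path that allocates a THP is expected to call prep_transhuge_page() on the head page before the page gets mapped, so the deferred-split list head and the TRANSHUGE_PAGE_DTOR destructor are in place. The sketch below is hypothetical (the helper name is made up); the same pairing appears later in this patch in alloc_hugepage() and khugepaged_alloc_page().

        /* Hypothetical allocation helper, sketching the alloc + prep pairing. */
        static struct page *example_alloc_transhuge(gfp_t gfp)
        {
                struct page *page = alloc_pages(gfp, HPAGE_PMD_ORDER);

                if (page)
                        prep_transhuge_page(page);
                return page;
        }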
+
 static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
                                        struct vm_area_struct *vma,
                                        unsigned long address, pmd_t *pmd,
@@ -843,8 +877,6 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
 
        if (haddr < vma->vm_start || haddr + HPAGE_PMD_SIZE > vma->vm_end)
                return VM_FAULT_FALLBACK;
-       if (vma->vm_flags & VM_LOCKED)
-               return VM_FAULT_FALLBACK;
        if (unlikely(anon_vma_prepare(vma)))
                return VM_FAULT_OOM;
        if (unlikely(khugepaged_enter(vma, vma->vm_flags)))
@@ -895,32 +927,33 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
                count_vm_event(THP_FAULT_FALLBACK);
                return VM_FAULT_FALLBACK;
        }
+       prep_transhuge_page(page);
        return __do_huge_pmd_anonymous_page(mm, vma, address, pmd, page, gfp,
                                            flags);
 }
 
 static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
-               pmd_t *pmd, unsigned long pfn, pgprot_t prot, bool write)
+               pmd_t *pmd, pfn_t pfn, pgprot_t prot, bool write)
 {
        struct mm_struct *mm = vma->vm_mm;
        pmd_t entry;
        spinlock_t *ptl;
 
        ptl = pmd_lock(mm, pmd);
-       if (pmd_none(*pmd)) {
-               entry = pmd_mkhuge(pfn_pmd(pfn, prot));
-               if (write) {
-                       entry = pmd_mkyoung(pmd_mkdirty(entry));
-                       entry = maybe_pmd_mkwrite(entry, vma);
-               }
-               set_pmd_at(mm, addr, pmd, entry);
-               update_mmu_cache_pmd(vma, addr, pmd);
-       }
+       entry = pmd_mkhuge(pfn_t_pmd(pfn, prot));
+       if (pfn_t_devmap(pfn))
+               entry = pmd_mkdevmap(entry);
+       if (write) {
+               entry = pmd_mkyoung(pmd_mkdirty(entry));
+               entry = maybe_pmd_mkwrite(entry, vma);
+       }
+       set_pmd_at(mm, addr, pmd, entry);
+       update_mmu_cache_pmd(vma, addr, pmd);
        spin_unlock(ptl);
 }
 
 int vmf_insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
-                       pmd_t *pmd, unsigned long pfn, bool write)
+                       pmd_t *pmd, pfn_t pfn, bool write)
 {
        pgprot_t pgprot = vma->vm_page_prot;
        /*
@@ -932,7 +965,7 @@ int vmf_insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
        BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) ==
                                                (VM_PFNMAP|VM_MIXEDMAP));
        BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
-       BUG_ON((vma->vm_flags & VM_MIXEDMAP) && pfn_valid(pfn));
+       BUG_ON(!pfn_t_devmap(pfn));
 
        if (addr < vma->vm_start || addr >= vma->vm_end)
                return VM_FAULT_SIGBUS;
@@ -942,6 +975,63 @@ int vmf_insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
        return VM_FAULT_NOPAGE;
 }
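For context, here is a minimal sketch of how a device fault handler might call the reworked vmf_insert_pfn_pmd(); the function name and the zero physical address are assumptions, not part of this patch. The point is that the pfn_t must carry devmap information (PFN_DEV | PFN_MAP), otherwise the BUG_ON(!pfn_t_devmap(pfn)) above fires.

        /* Hypothetical PMD fault handler for device memory (sketch only). */
        static int example_huge_fault(struct vm_area_struct *vma,
                                      unsigned long addr, pmd_t *pmd,
                                      unsigned int flags)
        {
                phys_addr_t phys = 0;   /* assumed PMD-aligned device memory */
                pfn_t pfn = phys_to_pfn_t(phys, PFN_DEV | PFN_MAP);

                return vmf_insert_pfn_pmd(vma, addr & PMD_MASK, pmd, pfn,
                                          flags & FAULT_FLAG_WRITE);
        }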
 
+static void touch_pmd(struct vm_area_struct *vma, unsigned long addr,
+               pmd_t *pmd)
+{
+       pmd_t _pmd;
+
+       /*
+        * We should set the dirty bit only for FOLL_WRITE but for now
+        * the dirty bit in the pmd is meaningless.  And if the dirty
+        * bit ever becomes meaningful and we only set it with
+        * FOLL_WRITE, an atomic set_bit will be required on the pmd to
+        * set the young bit, instead of the current set_pmd_at.
+        */
+       _pmd = pmd_mkyoung(pmd_mkdirty(*pmd));
+       if (pmdp_set_access_flags(vma, addr & HPAGE_PMD_MASK,
+                               pmd, _pmd,  1))
+               update_mmu_cache_pmd(vma, addr, pmd);
+}
+
+struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr,
+               pmd_t *pmd, int flags)
+{
+       unsigned long pfn = pmd_pfn(*pmd);
+       struct mm_struct *mm = vma->vm_mm;
+       struct dev_pagemap *pgmap;
+       struct page *page;
+
+       assert_spin_locked(pmd_lockptr(mm, pmd));
+
+       if (flags & FOLL_WRITE && !pmd_write(*pmd))
+               return NULL;
+
+       if (pmd_present(*pmd) && pmd_devmap(*pmd))
+               /* pass */;
+       else
+               return NULL;
+
+       if (flags & FOLL_TOUCH)
+               touch_pmd(vma, addr, pmd);
+
+       /*
+        * device mapped pages can only be returned if the
+        * caller will manage the page reference count.
+        */
+       if (!(flags & FOLL_GET))
+               return ERR_PTR(-EEXIST);
+
+       pfn += (addr & ~PMD_MASK) >> PAGE_SHIFT;
+       pgmap = get_dev_pagemap(pfn, NULL);
+       if (!pgmap)
+               return ERR_PTR(-EFAULT);
+       page = pfn_to_page(pfn);
+       get_page(page);
+       put_dev_pagemap(pgmap);
+
+       return page;
+}
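A hedged sketch of the expected calling convention, since the function both asserts that the pmd lock is held and refuses to return a page unless the caller takes a reference (FOLL_GET). The helper name is hypothetical; the real consumer of this interface is the get_user_pages() path, which is not part of this hunk.

        /* Hypothetical caller: pmd lock held, reference taken via FOLL_GET. */
        static struct page *example_get_devmap_page(struct vm_area_struct *vma,
                                                    unsigned long addr, pmd_t *pmd)
        {
                spinlock_t *ptl = pmd_lock(vma->vm_mm, pmd);
                struct page *page = NULL;

                if (pmd_devmap(*pmd))
                        page = follow_devmap_pmd(vma, addr, pmd, FOLL_GET);
                spin_unlock(ptl);

                /* NULL or error means no page; otherwise caller must put_page() */
                return IS_ERR_OR_NULL(page) ? NULL : page;
        }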
+
 int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
                  pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
                  struct vm_area_struct *vma)
@@ -963,7 +1053,7 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 
        ret = -EAGAIN;
        pmd = *src_pmd;
-       if (unlikely(!pmd_trans_huge(pmd))) {
+       if (unlikely(!pmd_trans_huge(pmd) && !pmd_devmap(pmd))) {
                pte_free(dst_mm, pgtable);
                goto out_unlock;
        }
@@ -986,17 +1076,20 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
                goto out_unlock;
        }
 
-       src_page = pmd_page(pmd);
-       VM_BUG_ON_PAGE(!PageHead(src_page), src_page);
-       get_page(src_page);
-       page_dup_rmap(src_page, true);
-       add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
+       if (pmd_trans_huge(pmd)) {
+               /* thp accounting separate from pmd_devmap accounting */
+               src_page = pmd_page(pmd);
+               VM_BUG_ON_PAGE(!PageHead(src_page), src_page);
+               get_page(src_page);
+               page_dup_rmap(src_page, true);
+               add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
+               atomic_long_inc(&dst_mm->nr_ptes);
+               pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable);
+       }
 
        pmdp_set_wrprotect(src_mm, addr, src_pmd);
        pmd = pmd_mkold(pmd_wrprotect(pmd));
-       pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable);
        set_pmd_at(dst_mm, addr, dst_pmd, pmd);
-       atomic_long_inc(&dst_mm->nr_ptes);
 
        ret = 0;
 out_unlock:
@@ -1191,7 +1284,9 @@ alloc:
        } else
                new_page = NULL;
 
-       if (unlikely(!new_page)) {
+       if (likely(new_page)) {
+               prep_transhuge_page(new_page);
+       } else {
                if (!page) {
                        split_huge_pmd(vma, pmd, address);
                        ret |= VM_FAULT_FALLBACK;
@@ -1294,23 +1389,23 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
 
        page = pmd_page(*pmd);
        VM_BUG_ON_PAGE(!PageHead(page), page);
-       if (flags & FOLL_TOUCH) {
-               pmd_t _pmd;
+       if (flags & FOLL_TOUCH)
+               touch_pmd(vma, addr, pmd);
+       if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
                /*
-                * We should set the dirty bit only for FOLL_WRITE but
-                * for now the dirty bit in the pmd is meaningless.
-                * And if the dirty bit will become meaningful and
-                * we'll only set it with FOLL_WRITE, an atomic
-                * set_bit will be required on the pmd to set the
-                * young bit, instead of the current set_pmd_at.
+                * We don't mlock() pte-mapped THPs. This way we can avoid
+                * leaking mlocked pages into non-VM_LOCKED VMAs.
+                *
+                * In most cases the pmd is the only mapping of the page as we
+                * break COW for the mlock() -- see gup_flags |= FOLL_WRITE for
+                * writable private mappings in populate_vma_page_range().
+                *
+                * The only scenario where we have the page shared here is if we
+                * are mlocking a read-only mapping shared over fork(). We skip
+                * mlocking such pages.
                 */
-               _pmd = pmd_mkyoung(pmd_mkdirty(*pmd));
-               if (pmdp_set_access_flags(vma, addr & HPAGE_PMD_MASK,
-                                         pmd, _pmd,  1))
-                       update_mmu_cache_pmd(vma, addr, pmd);
-       }
-       if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
-               if (page->mapping && trylock_page(page)) {
+               if (compound_mapcount(page) == 1 && !PageDoubleMap(page) &&
+                               page->mapping && trylock_page(page)) {
                        lru_add_drain();
                        if (page->mapping)
                                mlock_vma_page(page);
@@ -1455,13 +1550,86 @@ out:
        return 0;
 }
 
+int madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
+               pmd_t *pmd, unsigned long addr, unsigned long next)
+
+{
+       spinlock_t *ptl;
+       pmd_t orig_pmd;
+       struct page *page;
+       struct mm_struct *mm = tlb->mm;
+       int ret = 0;
+
+       ptl = pmd_trans_huge_lock(pmd, vma);
+       if (!ptl)
+               goto out_unlocked;
+
+       orig_pmd = *pmd;
+       if (is_huge_zero_pmd(orig_pmd)) {
+               ret = 1;
+               goto out;
+       }
+
+       page = pmd_page(orig_pmd);
+       /*
+        * If other processes are mapping this page, we can't discard
+        * the page unless they all do MADV_FREE, so let's skip the page.
+        */
+       if (page_mapcount(page) != 1)
+               goto out;
+
+       if (!trylock_page(page))
+               goto out;
+
+       /*
+        * If the user wants to discard only part of the THP, split it so
+        * MADV_FREE deactivates just those pages.
+        */
+       if (next - addr != HPAGE_PMD_SIZE) {
+               get_page(page);
+               spin_unlock(ptl);
+               if (split_huge_page(page)) {
+                       unlock_page(page);
+                       put_page(page);
+                       goto out_unlocked;
+               }
+               unlock_page(page);
+               put_page(page);
+               ret = 1;
+               goto out_unlocked;
+       }
+
+       if (PageDirty(page))
+               ClearPageDirty(page);
+       unlock_page(page);
+
+       if (PageActive(page))
+               deactivate_page(page);
+
+       if (pmd_young(orig_pmd) || pmd_dirty(orig_pmd)) {
+               orig_pmd = pmdp_huge_get_and_clear_full(tlb->mm, addr, pmd,
+                       tlb->fullmm);
+               orig_pmd = pmd_mkold(orig_pmd);
+               orig_pmd = pmd_mkclean(orig_pmd);
+
+               set_pmd_at(mm, addr, pmd, orig_pmd);
+               tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
+       }
+       ret = 1;
+out:
+       spin_unlock(ptl);
+out_unlocked:
+       return ret;
+}
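From userspace the new path is reached through madvise(2). A small illustration follows, assuming MADV_FREE is exposed by the headers of the same kernel series; when the range covers a whole, aligned PMD the huge-pmd handler above is used directly, otherwise the THP is split first.

        /* Userspace illustration: mark a THP-backed range as freeable. */
        #include <sys/mman.h>

        static int example_mark_freeable(void *addr, size_t len)
        {
                /* addr PMD-aligned and len == HPAGE_PMD_SIZE hits the huge path */
                return madvise(addr, len, MADV_FREE);
        }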
+
 int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
                 pmd_t *pmd, unsigned long addr)
 {
        pmd_t orig_pmd;
        spinlock_t *ptl;
 
-       if (!__pmd_trans_huge_lock(pmd, vma, &ptl))
+       ptl = __pmd_trans_huge_lock(pmd, vma);
+       if (!ptl)
                return 0;
        /*
         * For architectures like ppc64 we look at deposited pgtable
@@ -1524,7 +1692,8 @@ bool move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma,
         * We don't have to worry about the ordering of src and dst
         * ptlocks because exclusive mmap_sem prevents deadlock.
         */
-       if (__pmd_trans_huge_lock(old_pmd, vma, &old_ptl)) {
+       old_ptl = __pmd_trans_huge_lock(old_pmd, vma);
+       if (old_ptl) {
                new_ptl = pmd_lockptr(mm, new_pmd);
                if (new_ptl != old_ptl)
                        spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
@@ -1558,7 +1727,8 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
        spinlock_t *ptl;
        int ret = 0;
 
-       if (__pmd_trans_huge_lock(pmd, vma, &ptl)) {
+       ptl = __pmd_trans_huge_lock(pmd, vma);
+       if (ptl) {
                pmd_t entry;
                bool preserve_write = prot_numa && pmd_write(*pmd);
                ret = 1;
@@ -1594,53 +1764,13 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
  * Note that if it returns the page table lock pointer, this routine returns
  * without unlocking the page table lock. So callers must unlock it.
  */
-bool __pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma,
-               spinlock_t **ptl)
-{
-       *ptl = pmd_lock(vma->vm_mm, pmd);
-       if (likely(pmd_trans_huge(*pmd)))
-               return true;
-       spin_unlock(*ptl);
-       return false;
-}
-
-/*
- * This function returns whether a given @page is mapped onto the @address
- * in the virtual space of @mm.
- *
- * When it's true, this function returns *pmd with holding the page table lock
- * and passing it back to the caller via @ptl.
- * If it's false, returns NULL without holding the page table lock.
- */
-pmd_t *page_check_address_pmd(struct page *page,
-                             struct mm_struct *mm,
-                             unsigned long address,
-                             spinlock_t **ptl)
+spinlock_t *__pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma)
 {
-       pgd_t *pgd;
-       pud_t *pud;
-       pmd_t *pmd;
-
-       if (address & ~HPAGE_PMD_MASK)
-               return NULL;
-
-       pgd = pgd_offset(mm, address);
-       if (!pgd_present(*pgd))
-               return NULL;
-       pud = pud_offset(pgd, address);
-       if (!pud_present(*pud))
-               return NULL;
-       pmd = pmd_offset(pud, address);
-
-       *ptl = pmd_lock(mm, pmd);
-       if (!pmd_present(*pmd))
-               goto unlock;
-       if (pmd_page(*pmd) != page)
-               goto unlock;
-       if (pmd_trans_huge(*pmd))
-               return pmd;
-unlock:
-       spin_unlock(*ptl);
+       spinlock_t *ptl;
+       ptl = pmd_lock(vma->vm_mm, pmd);
+       if (likely(pmd_trans_huge(*pmd) || pmd_devmap(*pmd)))
+               return ptl;
+       spin_unlock(ptl);
        return NULL;
 }
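The conversion to returning the lock pointer simplifies every caller: the pointer doubles as the success flag, as the updated zap_huge_pmd(), move_huge_pmd() and change_huge_pmd() hunks above show. A minimal caller sketch (the function name is made up):

        static int example_with_locked_huge_pmd(struct vm_area_struct *vma,
                                                pmd_t *pmd)
        {
                spinlock_t *ptl = __pmd_trans_huge_lock(pmd, vma);

                if (!ptl)
                        return 0;
                /* *pmd is huge (or devmap) and stays so until we unlock */
                spin_unlock(ptl);
                return 1;
        }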
 
@@ -1942,7 +2072,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
        if (likely(writable)) {
                if (likely(referenced)) {
                        result = SCAN_SUCCEED;
-                       trace_mm_collapse_huge_page_isolate(page_to_pfn(page), none_or_zero,
+                       trace_mm_collapse_huge_page_isolate(page, none_or_zero,
                                                            referenced, writable, result);
                        return 1;
                }
@@ -1952,7 +2082,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
 
 out:
        release_pte_pages(pte, _pte);
-       trace_mm_collapse_huge_page_isolate(page_to_pfn(page), none_or_zero,
+       trace_mm_collapse_huge_page_isolate(page, none_or_zero,
                                            referenced, writable, result);
        return 0;
 }
@@ -2108,6 +2238,7 @@ khugepaged_alloc_page(struct page **hpage, gfp_t gfp, struct mm_struct *mm,
                return NULL;
        }
 
+       prep_transhuge_page(*hpage);
        count_vm_event(THP_COLLAPSE_ALLOC);
        return *hpage;
 }
@@ -2119,8 +2250,12 @@ static int khugepaged_find_target_node(void)
 
 static inline struct page *alloc_hugepage(int defrag)
 {
-       return alloc_pages(alloc_hugepage_gfpmask(defrag, 0),
-                          HPAGE_PMD_ORDER);
+       struct page *page;
+
+       page = alloc_pages(alloc_hugepage_gfpmask(defrag, 0), HPAGE_PMD_ORDER);
+       if (page)
+               prep_transhuge_page(page);
+       return page;
 }
 
 static struct page *khugepaged_alloc_hugepage(bool *wait)
@@ -2170,8 +2305,6 @@ static bool hugepage_vma_check(struct vm_area_struct *vma)
        if ((!(vma->vm_flags & VM_HUGEPAGE) && !khugepaged_always()) ||
            (vma->vm_flags & VM_NOHUGEPAGE))
                return false;
-       if (vma->vm_flags & VM_LOCKED)
-               return false;
        if (!vma->anon_vma || vma->vm_ops)
                return false;
        if (is_vma_temporary_stack(vma))
@@ -2191,7 +2324,7 @@ static void collapse_huge_page(struct mm_struct *mm,
        pgtable_t pgtable;
        struct page *new_page;
        spinlock_t *pmd_ptl, *pte_ptl;
-       int isolated, result = 0;
+       int isolated = 0, result = 0;
        unsigned long hstart, hend;
        struct mem_cgroup *memcg;
        unsigned long mmun_start;       /* For mmu_notifiers */
@@ -2447,7 +2580,7 @@ out_unmap:
                collapse_huge_page(mm, address, hpage, vma, node);
        }
 out:
-       trace_mm_khugepaged_scan_pmd(mm, page_to_pfn(page), writable, referenced,
+       trace_mm_khugepaged_scan_pmd(mm, page, writable, referenced,
                                     none_or_zero, result);
        return ret;
 }
@@ -2701,13 +2834,13 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
        struct page *page;
        pgtable_t pgtable;
        pmd_t _pmd;
-       bool young, write;
+       bool young, write, dirty;
        int i;
 
        VM_BUG_ON(haddr & ~HPAGE_PMD_MASK);
        VM_BUG_ON_VMA(vma->vm_start > haddr, vma);
        VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PMD_SIZE, vma);
-       VM_BUG_ON(!pmd_trans_huge(*pmd));
+       VM_BUG_ON(!pmd_trans_huge(*pmd) && !pmd_devmap(*pmd));
 
        count_vm_event(THP_SPLIT_PMD);
 
@@ -2725,9 +2858,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
        atomic_add(HPAGE_PMD_NR - 1, &page->_count);
        write = pmd_write(*pmd);
        young = pmd_young(*pmd);
-
-       /* leave pmd empty until pte is filled */
-       pmdp_huge_clear_flush_notify(vma, haddr, pmd);
+       dirty = pmd_dirty(*pmd);
 
        pgtable = pgtable_trans_huge_withdraw(mm, pmd);
        pmd_populate(mm, &_pmd, pgtable);
@@ -2745,12 +2876,14 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
                        entry = swp_entry_to_pte(swp_entry);
                } else {
                        entry = mk_pte(page + i, vma->vm_page_prot);
-                       entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+                       entry = maybe_mkwrite(entry, vma);
                        if (!write)
                                entry = pte_wrprotect(entry);
                        if (!young)
                                entry = pte_mkold(entry);
                }
+               if (dirty)
+                       SetPageDirty(page + i);
                pte = pte_offset_map(&_pmd, haddr);
                BUG_ON(!pte_none(*pte));
                set_pte_at(mm, haddr, pte, entry);
@@ -2778,7 +2911,36 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
        }
 
        smp_wmb(); /* make pte visible before pmd */
+       /*
+        * Up to this point the pmd is present and huge and userland has the
+        * whole access to the hugepage during the split (which happens in
+        * place). If we overwrite the pmd with the not-huge version pointing
+        * to the pte here (which of course we could if all CPUs were bug
+        * free), userland could trigger a small page size TLB miss on the
+        * small sized TLB while the hugepage TLB entry is still established in
+        * the huge TLB. Some CPUs don't like that.
+        * See http://support.amd.com/us/Processor_TechDocs/41322.pdf, Erratum
+        * 383 on page 93. Intel should be safe but also warns that it's
+        * only safe if the permission and cache attributes of the two entries
+        * loaded into the two TLBs are identical (which should be the case here).
+        * But it is generally safer to never allow small and huge TLB entries
+        * for the same virtual address to be loaded simultaneously. So instead
+        * of doing "pmd_populate(); flush_pmd_tlb_range();" we first mark the
+        * current pmd notpresent (atomically because here the pmd_trans_huge
+        * and pmd_trans_splitting must remain set at all times on the pmd
+        * until the split is complete for this pmd), then we flush the SMP TLB
+        * and finally we write the non-huge version of the pmd entry with
+        * pmd_populate.
+        */
+       pmdp_invalidate(vma, haddr, pmd);
        pmd_populate(mm, pmd, pgtable);
+
+       if (freeze) {
+               for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
+                       page_remove_rmap(page + i, false);
+                       put_page(page + i);
+               }
+       }
 }
 
 void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
@@ -2786,14 +2948,29 @@ void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
 {
        spinlock_t *ptl;
        struct mm_struct *mm = vma->vm_mm;
+       struct page *page = NULL;
        unsigned long haddr = address & HPAGE_PMD_MASK;
 
        mmu_notifier_invalidate_range_start(mm, haddr, haddr + HPAGE_PMD_SIZE);
        ptl = pmd_lock(mm, pmd);
-       if (likely(pmd_trans_huge(*pmd)))
-               __split_huge_pmd_locked(vma, pmd, haddr, false);
+       if (pmd_trans_huge(*pmd)) {
+               page = pmd_page(*pmd);
+               if (PageMlocked(page))
+                       get_page(page);
+               else
+                       page = NULL;
+       } else if (!pmd_devmap(*pmd))
+               goto out;
+       __split_huge_pmd_locked(vma, pmd, haddr, false);
+out:
        spin_unlock(ptl);
        mmu_notifier_invalidate_range_end(mm, haddr, haddr + HPAGE_PMD_SIZE);
+       if (page) {
+               lock_page(page);
+               munlock_vma_page(page);
+               unlock_page(page);
+               put_page(page);
+       }
 }
 
 static void split_huge_pmd_address(struct vm_area_struct *vma,
@@ -2814,7 +2991,7 @@ static void split_huge_pmd_address(struct vm_area_struct *vma,
                return;
 
        pmd = pmd_offset(pud, address);
-       if (!pmd_present(*pmd) || !pmd_trans_huge(*pmd))
+       if (!pmd_present(*pmd) || (!pmd_trans_huge(*pmd) && !pmd_devmap(*pmd)))
                return;
        /*
         * Caller holds the mmap_sem write mode, so a huge pmd cannot
@@ -2863,3 +3040,538 @@ void vma_adjust_trans_huge(struct vm_area_struct *vma,
                        split_huge_pmd_address(next, nstart);
        }
 }
+
+static void freeze_page_vma(struct vm_area_struct *vma, struct page *page,
+               unsigned long address)
+{
+       unsigned long haddr = address & HPAGE_PMD_MASK;
+       spinlock_t *ptl;
+       pgd_t *pgd;
+       pud_t *pud;
+       pmd_t *pmd;
+       pte_t *pte;
+       int i, nr = HPAGE_PMD_NR;
+
+       /* Skip pages which don't belong to the VMA */
+       if (address < vma->vm_start) {
+               int off = (vma->vm_start - address) >> PAGE_SHIFT;
+               page += off;
+               nr -= off;
+               address = vma->vm_start;
+       }
+
+       pgd = pgd_offset(vma->vm_mm, address);
+       if (!pgd_present(*pgd))
+               return;
+       pud = pud_offset(pgd, address);
+       if (!pud_present(*pud))
+               return;
+       pmd = pmd_offset(pud, address);
+       ptl = pmd_lock(vma->vm_mm, pmd);
+       if (!pmd_present(*pmd)) {
+               spin_unlock(ptl);
+               return;
+       }
+       if (pmd_trans_huge(*pmd)) {
+               if (page == pmd_page(*pmd))
+                       __split_huge_pmd_locked(vma, pmd, haddr, true);
+               spin_unlock(ptl);
+               return;
+       }
+       spin_unlock(ptl);
+
+       pte = pte_offset_map_lock(vma->vm_mm, pmd, address, &ptl);
+       for (i = 0; i < nr; i++, address += PAGE_SIZE, page++, pte++) {
+               pte_t entry, swp_pte;
+               swp_entry_t swp_entry;
+
+               /*
+                * We've just crossed page table boundary: need to map next one.
+                * It can happen if the THP was mremap()ed to a non-PMD-aligned address.
+                */
+               if (unlikely(address == haddr + HPAGE_PMD_SIZE)) {
+                       pte_unmap_unlock(pte - 1, ptl);
+                       pmd = mm_find_pmd(vma->vm_mm, address);
+                       if (!pmd)
+                               return;
+                       pte = pte_offset_map_lock(vma->vm_mm, pmd,
+                                       address, &ptl);
+               }
+
+               if (!pte_present(*pte))
+                       continue;
+               if (page_to_pfn(page) != pte_pfn(*pte))
+                       continue;
+               flush_cache_page(vma, address, page_to_pfn(page));
+               entry = ptep_clear_flush(vma, address, pte);
+               if (pte_dirty(entry))
+                       SetPageDirty(page);
+               swp_entry = make_migration_entry(page, pte_write(entry));
+               swp_pte = swp_entry_to_pte(swp_entry);
+               if (pte_soft_dirty(entry))
+                       swp_pte = pte_swp_mksoft_dirty(swp_pte);
+               set_pte_at(vma->vm_mm, address, pte, swp_pte);
+               page_remove_rmap(page, false);
+               put_page(page);
+       }
+       pte_unmap_unlock(pte - 1, ptl);
+}
+
+static void freeze_page(struct anon_vma *anon_vma, struct page *page)
+{
+       struct anon_vma_chain *avc;
+       pgoff_t pgoff = page_to_pgoff(page);
+
+       VM_BUG_ON_PAGE(!PageHead(page), page);
+
+       anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff,
+                       pgoff + HPAGE_PMD_NR - 1) {
+               unsigned long address = __vma_address(page, avc->vma);
+
+               mmu_notifier_invalidate_range_start(avc->vma->vm_mm,
+                               address, address + HPAGE_PMD_SIZE);
+               freeze_page_vma(avc->vma, page, address);
+               mmu_notifier_invalidate_range_end(avc->vma->vm_mm,
+                               address, address + HPAGE_PMD_SIZE);
+       }
+}
+
+static void unfreeze_page_vma(struct vm_area_struct *vma, struct page *page,
+               unsigned long address)
+{
+       spinlock_t *ptl;
+       pmd_t *pmd;
+       pte_t *pte, entry;
+       swp_entry_t swp_entry;
+       unsigned long haddr = address & HPAGE_PMD_MASK;
+       int i, nr = HPAGE_PMD_NR;
+
+       /* Skip pages which don't belong to the VMA */
+       if (address < vma->vm_start) {
+               int off = (vma->vm_start - address) >> PAGE_SHIFT;
+               page += off;
+               nr -= off;
+               address = vma->vm_start;
+       }
+
+       pmd = mm_find_pmd(vma->vm_mm, address);
+       if (!pmd)
+               return;
+
+       pte = pte_offset_map_lock(vma->vm_mm, pmd, address, &ptl);
+       for (i = 0; i < nr; i++, address += PAGE_SIZE, page++, pte++) {
+               /*
+                * We've just crossed page table boundary: need to map next one.
+                * It can happen if the THP was mremap()ed to a non-PMD-aligned address.
+                */
+               if (unlikely(address == haddr + HPAGE_PMD_SIZE)) {
+                       pte_unmap_unlock(pte - 1, ptl);
+                       pmd = mm_find_pmd(vma->vm_mm, address);
+                       if (!pmd)
+                               return;
+                       pte = pte_offset_map_lock(vma->vm_mm, pmd,
+                                       address, &ptl);
+               }
+
+               if (!is_swap_pte(*pte))
+                       continue;
+
+               swp_entry = pte_to_swp_entry(*pte);
+               if (!is_migration_entry(swp_entry))
+                       continue;
+               if (migration_entry_to_page(swp_entry) != page)
+                       continue;
+
+               get_page(page);
+               page_add_anon_rmap(page, vma, address, false);
+
+               entry = pte_mkold(mk_pte(page, vma->vm_page_prot));
+               if (PageDirty(page))
+                       entry = pte_mkdirty(entry);
+               if (is_write_migration_entry(swp_entry))
+                       entry = maybe_mkwrite(entry, vma);
+
+               flush_dcache_page(page);
+               set_pte_at(vma->vm_mm, address, pte, entry);
+
+               /* No need to invalidate - it was non-present before */
+               update_mmu_cache(vma, address, pte);
+       }
+       pte_unmap_unlock(pte - 1, ptl);
+}
+
+static void unfreeze_page(struct anon_vma *anon_vma, struct page *page)
+{
+       struct anon_vma_chain *avc;
+       pgoff_t pgoff = page_to_pgoff(page);
+
+       anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root,
+                       pgoff, pgoff + HPAGE_PMD_NR - 1) {
+               unsigned long address = __vma_address(page, avc->vma);
+
+               mmu_notifier_invalidate_range_start(avc->vma->vm_mm,
+                               address, address + HPAGE_PMD_SIZE);
+               unfreeze_page_vma(avc->vma, page, address);
+               mmu_notifier_invalidate_range_end(avc->vma->vm_mm,
+                               address, address + HPAGE_PMD_SIZE);
+       }
+}
+
+static int __split_huge_page_tail(struct page *head, int tail,
+               struct lruvec *lruvec, struct list_head *list)
+{
+       int mapcount;
+       struct page *page_tail = head + tail;
+
+       mapcount = atomic_read(&page_tail->_mapcount) + 1;
+       VM_BUG_ON_PAGE(atomic_read(&page_tail->_count) != 0, page_tail);
+
+       /*
+        * tail_page->_count is zero and not changing from under us. But
+        * get_page_unless_zero() may be running from under us on the
+        * tail_page. If we used atomic_set() below instead of atomic_add(), we
+        * would then run atomic_set() concurrently with
+        * get_page_unless_zero(), and atomic_set() is implemented in C not
+        * using locked ops. spin_unlock on x86 sometimes uses locked ops
+        * because of PPro errata 66, 92, so unless somebody can guarantee
+        * atomic_set() here would be safe on all archs (and not only on x86),
+        * it's safer to use atomic_add().
+        */
+       atomic_add(mapcount + 1, &page_tail->_count);
+
+       page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
+       page_tail->flags |= (head->flags &
+                       ((1L << PG_referenced) |
+                        (1L << PG_swapbacked) |
+                        (1L << PG_mlocked) |
+                        (1L << PG_uptodate) |
+                        (1L << PG_active) |
+                        (1L << PG_locked) |
+                        (1L << PG_unevictable) |
+                        (1L << PG_dirty)));
+
+       /*
+        * After clearing PageTail the gup refcount can be released.
+        * Page flags also must be visible before we make the page non-compound.
+        */
+       smp_wmb();
+
+       clear_compound_head(page_tail);
+
+       if (page_is_young(head))
+               set_page_young(page_tail);
+       if (page_is_idle(head))
+               set_page_idle(page_tail);
+
+       /* ->mapping in first tail page is compound_mapcount */
+       VM_BUG_ON_PAGE(tail > 2 && page_tail->mapping != TAIL_MAPPING,
+                       page_tail);
+       page_tail->mapping = head->mapping;
+
+       page_tail->index = head->index + tail;
+       page_cpupid_xchg_last(page_tail, page_cpupid_last(head));
+       lru_add_page_tail(head, page_tail, lruvec, list);
+
+       return mapcount;
+}
+
+static void __split_huge_page(struct page *page, struct list_head *list)
+{
+       struct page *head = compound_head(page);
+       struct zone *zone = page_zone(head);
+       struct lruvec *lruvec;
+       int i, tail_mapcount;
+
+       /* prevent PageLRU from going away from under us, and freeze lru stats */
+       spin_lock_irq(&zone->lru_lock);
+       lruvec = mem_cgroup_page_lruvec(head, zone);
+
+       /* complete memcg works before add pages to LRU */
+       mem_cgroup_split_huge_fixup(head);
+
+       tail_mapcount = 0;
+       for (i = HPAGE_PMD_NR - 1; i >= 1; i--)
+               tail_mapcount += __split_huge_page_tail(head, i, lruvec, list);
+       atomic_sub(tail_mapcount, &head->_count);
+
+       ClearPageCompound(head);
+       spin_unlock_irq(&zone->lru_lock);
+
+       unfreeze_page(page_anon_vma(head), head);
+
+       for (i = 0; i < HPAGE_PMD_NR; i++) {
+               struct page *subpage = head + i;
+               if (subpage == page)
+                       continue;
+               unlock_page(subpage);
+
+               /*
+                * Subpages may be freed if there wasn't any mapping,
+                * like when add_to_swap() is running on an LRU page that
+                * had its mapping zapped. And freeing these pages
+                * requires taking the lru_lock so we do the put_page
+                * of the tail pages after the split is complete.
+                */
+               put_page(subpage);
+       }
+}
+
+int total_mapcount(struct page *page)
+{
+       int i, ret;
+
+       VM_BUG_ON_PAGE(PageTail(page), page);
+
+       if (likely(!PageCompound(page)))
+               return atomic_read(&page->_mapcount) + 1;
+
+       ret = compound_mapcount(page);
+       if (PageHuge(page))
+               return ret;
+       for (i = 0; i < HPAGE_PMD_NR; i++)
+               ret += atomic_read(&page[i]._mapcount) + 1;
+       if (PageDoubleMap(page))
+               ret -= HPAGE_PMD_NR;
+       return ret;
+}
+
+/*
+ * This function splits a huge page into normal pages. @page can point to
+ * any subpage of the huge page to split. The split doesn't change the
+ * position of @page.
+ *
+ * Only the caller must hold a pin on the @page; any additional pin makes
+ * the split fail with -EBUSY.
+ * The huge page must be locked.
+ *
+ * If @list is null, tail pages will be added to LRU list, otherwise, to @list.
+ *
+ * Both head page and tail pages will inherit mapping, flags, and so on from
+ * the hugepage.
+ *
+ * The GUP pin and PG_locked are transferred to @page. The rest of the
+ * subpages can be freed if they are not mapped.
+ *
+ * Returns 0 if the hugepage is split successfully.
+ * Returns -EBUSY if the page is pinned or if anon_vma disappeared from under
+ * us.
+ */
+int split_huge_page_to_list(struct page *page, struct list_head *list)
+{
+       struct page *head = compound_head(page);
+       struct anon_vma *anon_vma;
+       int count, mapcount, ret;
+       bool mlocked;
+       unsigned long flags;
+
+       VM_BUG_ON_PAGE(is_huge_zero_page(page), page);
+       VM_BUG_ON_PAGE(!PageAnon(page), page);
+       VM_BUG_ON_PAGE(!PageLocked(page), page);
+       VM_BUG_ON_PAGE(!PageSwapBacked(page), page);
+       VM_BUG_ON_PAGE(!PageCompound(page), page);
+
+       /*
+        * The caller does not necessarily hold an mmap_sem that would prevent
+        * the anon_vma disappearing, so we first take a reference to it
+        * and then lock the anon_vma for write. This is similar to
+        * page_lock_anon_vma_read except the write lock is taken to serialise
+        * against parallel split or collapse operations.
+        */
+       anon_vma = page_get_anon_vma(head);
+       if (!anon_vma) {
+               ret = -EBUSY;
+               goto out;
+       }
+       anon_vma_lock_write(anon_vma);
+
+       /*
+        * Racy check whether we can split the page before freeze_page()
+        * splits the PMDs
+        */
+       if (total_mapcount(head) != page_count(head) - 1) {
+               ret = -EBUSY;
+               goto out_unlock;
+       }
+
+       mlocked = PageMlocked(page);
+       freeze_page(anon_vma, head);
+       VM_BUG_ON_PAGE(compound_mapcount(head), head);
+
+       /* Make sure the page is not on per-CPU pagevec as it takes pin */
+       if (mlocked)
+               lru_add_drain();
+
+       /* Prevent deferred_split_scan() touching ->_count */
+       spin_lock_irqsave(&split_queue_lock, flags);
+       count = page_count(head);
+       mapcount = total_mapcount(head);
+       if (!mapcount && count == 1) {
+               if (!list_empty(page_deferred_list(head))) {
+                       split_queue_len--;
+                       list_del(page_deferred_list(head));
+               }
+               spin_unlock_irqrestore(&split_queue_lock, flags);
+               __split_huge_page(page, list);
+               ret = 0;
+       } else if (IS_ENABLED(CONFIG_DEBUG_VM) && mapcount) {
+               spin_unlock_irqrestore(&split_queue_lock, flags);
+               pr_alert("total_mapcount: %u, page_count(): %u\n",
+                               mapcount, count);
+               if (PageTail(page))
+                       dump_page(head, NULL);
+               dump_page(page, "total_mapcount(head) > 0");
+               BUG();
+       } else {
+               spin_unlock_irqrestore(&split_queue_lock, flags);
+               unfreeze_page(anon_vma, head);
+               ret = -EBUSY;
+       }
+
+out_unlock:
+       anon_vma_unlock_write(anon_vma);
+       put_anon_vma(anon_vma);
+out:
+       count_vm_event(!ret ? THP_SPLIT_PAGE : THP_SPLIT_PAGE_FAILED);
+       return ret;
+}
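A caller sketch following the kerneldoc above: pin the page, lock it, then try the split; the split_huge_page() wrapper (declared in huge_mm.h) passes a NULL list so the tails end up on the LRU. This mirrors the debugfs and shrinker users added below; the helper name itself is made up.

        static bool example_try_split(struct page *page)
        {
                bool split = false;

                if (!get_page_unless_zero(page))
                        return false;
                lock_page(page);
                /* assumes an anonymous, non-hugetlbfs compound page */
                if (PageCompound(page) && !split_huge_page(page))
                        split = true;
                unlock_page(page);
                put_page(page);
                return split;
        }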
+
+void free_transhuge_page(struct page *page)
+{
+       unsigned long flags;
+
+       spin_lock_irqsave(&split_queue_lock, flags);
+       if (!list_empty(page_deferred_list(page))) {
+               split_queue_len--;
+               list_del(page_deferred_list(page));
+       }
+       spin_unlock_irqrestore(&split_queue_lock, flags);
+       free_compound_page(page);
+}
+
+void deferred_split_huge_page(struct page *page)
+{
+       unsigned long flags;
+
+       VM_BUG_ON_PAGE(!PageTransHuge(page), page);
+
+       spin_lock_irqsave(&split_queue_lock, flags);
+       if (list_empty(page_deferred_list(page))) {
+               list_add_tail(page_deferred_list(page), &split_queue);
+               split_queue_len++;
+       }
+       spin_unlock_irqrestore(&split_queue_lock, flags);
+}
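Nothing in this hunk calls deferred_split_huge_page(); the expectation (an assumption about the companion rmap change, which is not shown here) is that a THP gets queued when it loses its PMD mapping but may still be partially mapped by ptes, so the shrinker below can split and reclaim it later under memory pressure.

        /* Hypothetical trigger sketch; the real call site lives in the rmap code. */
        static void example_queue_deferred_split(struct page *page)
        {
                page = compound_head(page);
                if (PageTransHuge(page) && PageAnon(page))
                        deferred_split_huge_page(page);
        }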
+
+static unsigned long deferred_split_count(struct shrinker *shrink,
+               struct shrink_control *sc)
+{
+       /*
+        * Splitting a page from split_queue will free up at least one page,
+        * at most HPAGE_PMD_NR - 1. We don't track the exact number, so
+        * let's use HPAGE_PMD_NR / 2 as a ballpark.
+        */
+       return ACCESS_ONCE(split_queue_len) * HPAGE_PMD_NR / 2;
+}
+
+static unsigned long deferred_split_scan(struct shrinker *shrink,
+               struct shrink_control *sc)
+{
+       unsigned long flags;
+       LIST_HEAD(list), *pos, *next;
+       struct page *page;
+       int split = 0;
+
+       spin_lock_irqsave(&split_queue_lock, flags);
+       list_splice_init(&split_queue, &list);
+
+       /* Take pin on all head pages to avoid freeing them under us */
+       list_for_each_safe(pos, next, &list) {
+               page = list_entry((void *)pos, struct page, mapping);
+               page = compound_head(page);
+               /* race with put_compound_page() */
+               if (!get_page_unless_zero(page)) {
+                       list_del_init(page_deferred_list(page));
+                       split_queue_len--;
+               }
+       }
+       spin_unlock_irqrestore(&split_queue_lock, flags);
+
+       list_for_each_safe(pos, next, &list) {
+               page = list_entry((void *)pos, struct page, mapping);
+               lock_page(page);
+               /* split_huge_page() removes page from list on success */
+               if (!split_huge_page(page))
+                       split++;
+               unlock_page(page);
+               put_page(page);
+       }
+
+       spin_lock_irqsave(&split_queue_lock, flags);
+       list_splice_tail(&list, &split_queue);
+       spin_unlock_irqrestore(&split_queue_lock, flags);
+
+       return split * HPAGE_PMD_NR / 2;
+}
+
+static struct shrinker deferred_split_shrinker = {
+       .count_objects = deferred_split_count,
+       .scan_objects = deferred_split_scan,
+       .seeks = DEFAULT_SEEKS,
+};
+
+#ifdef CONFIG_DEBUG_FS
+static int split_huge_pages_set(void *data, u64 val)
+{
+       struct zone *zone;
+       struct page *page;
+       unsigned long pfn, max_zone_pfn;
+       unsigned long total = 0, split = 0;
+
+       if (val != 1)
+               return -EINVAL;
+
+       for_each_populated_zone(zone) {
+               max_zone_pfn = zone_end_pfn(zone);
+               for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) {
+                       if (!pfn_valid(pfn))
+                               continue;
+
+                       page = pfn_to_page(pfn);
+                       if (!get_page_unless_zero(page))
+                               continue;
+
+                       if (zone != page_zone(page))
+                               goto next;
+
+                       if (!PageHead(page) || !PageAnon(page) ||
+                                       PageHuge(page))
+                               goto next;
+
+                       total++;
+                       lock_page(page);
+                       if (!split_huge_page(page))
+                               split++;
+                       unlock_page(page);
+next:
+                       put_page(page);
+               }
+       }
+
+       pr_info("%lu of %lu THP split\n", split, total);
+
+       return 0;
+}
+DEFINE_SIMPLE_ATTRIBUTE(split_huge_pages_fops, NULL, split_huge_pages_set,
+               "%llu\n");
+
+static int __init split_huge_pages_debugfs(void)
+{
+       void *ret;
+
+       ret = debugfs_create_file("split_huge_pages", 0644, NULL, NULL,
+                       &split_huge_pages_fops);
+       if (!ret)
+               pr_warn("Failed to create split_huge_pages in debugfs\n");
+       return 0;
+}
+late_initcall(split_huge_pages_debugfs);
+#endif
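For completeness, the debugfs knob can be poked from userspace once debugfs is mounted (commonly at /sys/kernel/debug); writing 1 runs split_huge_pages_set() above over every populated zone. A minimal illustration:

        #include <stdio.h>

        int main(void)
        {
                FILE *f = fopen("/sys/kernel/debug/split_huge_pages", "w");

                if (!f)
                        return 1;
                fputs("1", f);
                return fclose(f) ? 1 : 0;
        }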