(1<<TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
/* default scan 8*512 pte (or vmas) every 30 second */
-static unsigned int khugepaged_pages_to_scan __read_mostly = HPAGE_PMD_NR*8;
+static unsigned int khugepaged_pages_to_scan __read_mostly;
static unsigned int khugepaged_pages_collapsed;
static unsigned int khugepaged_full_scans;
static unsigned int khugepaged_scan_sleep_millisecs __read_mostly = 10000;
* it would have happened if the vma was large enough during page
* fault.
*/
-static unsigned int khugepaged_max_ptes_none __read_mostly = HPAGE_PMD_NR-1;
+static unsigned int khugepaged_max_ptes_none __read_mostly;
static int khugepaged(void *none);
static int khugepaged_slab_init(void);
if (recommended_min > min_free_kbytes) {
if (user_min_free_kbytes >= 0)
- pr_info("raising min_free_kbytes from %d to %lu "
- "to help transparent hugepage allocations\n",
+ pr_info("raising min_free_kbytes from %d to %lu to help transparent hugepage allocations\n",
min_free_kbytes, recommended_min);
min_free_kbytes = recommended_min;
return -EINVAL;
}
+ khugepaged_pages_to_scan = HPAGE_PMD_NR * 8;
+ khugepaged_max_ptes_none = HPAGE_PMD_NR - 1;
+ /*
+ * hugepages can't be allocated by the buddy allocator
+ */
+ MAYBE_BUILD_BUG_ON(HPAGE_PMD_ORDER >= MAX_ORDER);
+ /*
+ * we use page->mapping and page->index in second tail page
+ * as list_head: assuming THP order >= 2
+ */
+ MAYBE_BUILD_BUG_ON(HPAGE_PMD_ORDER < 2);
+
err = hugepage_init_sysfs(&hugepage_kobj);
if (err)
goto err_sysfs;
* we use page->mapping and page->indexlru in second tail page
* as list_head: assuming THP order >= 2
*/
- BUILD_BUG_ON(HPAGE_PMD_ORDER < 2);
INIT_LIST_HEAD(page_deferred_list(page));
set_compound_page_dtor(page, TRANSHUGE_PAGE_DTOR);
page = pmd_page(*pmd);
VM_BUG_ON_PAGE(!page_count(page), page);
- atomic_add(HPAGE_PMD_NR - 1, &page->_count);
+ page_ref_add(page, HPAGE_PMD_NR - 1);
write = pmd_write(*pmd);
young = pmd_young(*pmd);
dirty = pmd_dirty(*pmd);
}
void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
- unsigned long address)
+ unsigned long address, bool freeze)
{
spinlock_t *ptl;
struct mm_struct *mm = vma->vm_mm;
- struct page *page = NULL;
unsigned long haddr = address & HPAGE_PMD_MASK;
mmu_notifier_invalidate_range_start(mm, haddr, haddr + HPAGE_PMD_SIZE);
ptl = pmd_lock(mm, pmd);
if (pmd_trans_huge(*pmd)) {
- page = pmd_page(*pmd);
+ struct page *page = pmd_page(*pmd);
if (PageMlocked(page))
- get_page(page);
- else
- page = NULL;
+ clear_page_mlock(page);
} else if (!pmd_devmap(*pmd))
goto out;
- __split_huge_pmd_locked(vma, pmd, haddr, false);
+ __split_huge_pmd_locked(vma, pmd, haddr, freeze);
out:
spin_unlock(ptl);
mmu_notifier_invalidate_range_end(mm, haddr, haddr + HPAGE_PMD_SIZE);
- if (page) {
- lock_page(page);
- munlock_vma_page(page);
- unlock_page(page);
- put_page(page);
- }
}
-static void split_huge_pmd_address(struct vm_area_struct *vma,
- unsigned long address)
+void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address,
+ bool freeze, struct page *page)
{
pgd_t *pgd;
pud_t *pud;
pmd_t *pmd;
- VM_BUG_ON(!(address & ~HPAGE_PMD_MASK));
-
pgd = pgd_offset(vma->vm_mm, address);
if (!pgd_present(*pgd))
return;
pmd = pmd_offset(pud, address);
if (!pmd_present(*pmd) || (!pmd_trans_huge(*pmd) && !pmd_devmap(*pmd)))
return;
+
+ /*
+ * If caller asks to setup a migration entries, we need a page to check
+ * pmd against. Otherwise we can end up replacing wrong page.
+ */
+ VM_BUG_ON(freeze && !page);
+ if (page && page != pmd_page(*pmd))
+ return;
+
/*
* Caller holds the mmap_sem write mode, so a huge pmd cannot
* materialize from under us.
*/
- split_huge_pmd(vma, pmd, address);
+ __split_huge_pmd(vma, pmd, address, freeze);
}
void vma_adjust_trans_huge(struct vm_area_struct *vma,
if (start & ~HPAGE_PMD_MASK &&
(start & HPAGE_PMD_MASK) >= vma->vm_start &&
(start & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end)
- split_huge_pmd_address(vma, start);
+ split_huge_pmd_address(vma, start, false, NULL);
/*
* If the new end address isn't hpage aligned and it could
if (end & ~HPAGE_PMD_MASK &&
(end & HPAGE_PMD_MASK) >= vma->vm_start &&
(end & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end)
- split_huge_pmd_address(vma, end);
+ split_huge_pmd_address(vma, end, false, NULL);
/*
* If we're also updating the vma->vm_next->vm_start, if the new
if (nstart & ~HPAGE_PMD_MASK &&
(nstart & HPAGE_PMD_MASK) >= next->vm_start &&
(nstart & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= next->vm_end)
- split_huge_pmd_address(next, nstart);
- }
-}
-
-static void freeze_page_vma(struct vm_area_struct *vma, struct page *page,
- unsigned long address)
-{
- unsigned long haddr = address & HPAGE_PMD_MASK;
- spinlock_t *ptl;
- pgd_t *pgd;
- pud_t *pud;
- pmd_t *pmd;
- pte_t *pte;
- int i, nr = HPAGE_PMD_NR;
-
- /* Skip pages which doesn't belong to the VMA */
- if (address < vma->vm_start) {
- int off = (vma->vm_start - address) >> PAGE_SHIFT;
- page += off;
- nr -= off;
- address = vma->vm_start;
- }
-
- pgd = pgd_offset(vma->vm_mm, address);
- if (!pgd_present(*pgd))
- return;
- pud = pud_offset(pgd, address);
- if (!pud_present(*pud))
- return;
- pmd = pmd_offset(pud, address);
- ptl = pmd_lock(vma->vm_mm, pmd);
- if (!pmd_present(*pmd)) {
- spin_unlock(ptl);
- return;
- }
- if (pmd_trans_huge(*pmd)) {
- if (page == pmd_page(*pmd))
- __split_huge_pmd_locked(vma, pmd, haddr, true);
- spin_unlock(ptl);
- return;
- }
- spin_unlock(ptl);
-
- pte = pte_offset_map_lock(vma->vm_mm, pmd, address, &ptl);
- for (i = 0; i < nr; i++, address += PAGE_SIZE, page++, pte++) {
- pte_t entry, swp_pte;
- swp_entry_t swp_entry;
-
- /*
- * We've just crossed page table boundary: need to map next one.
- * It can happen if THP was mremaped to non PMD-aligned address.
- */
- if (unlikely(address == haddr + HPAGE_PMD_SIZE)) {
- pte_unmap_unlock(pte - 1, ptl);
- pmd = mm_find_pmd(vma->vm_mm, address);
- if (!pmd)
- return;
- pte = pte_offset_map_lock(vma->vm_mm, pmd,
- address, &ptl);
- }
-
- if (!pte_present(*pte))
- continue;
- if (page_to_pfn(page) != pte_pfn(*pte))
- continue;
- flush_cache_page(vma, address, page_to_pfn(page));
- entry = ptep_clear_flush(vma, address, pte);
- if (pte_dirty(entry))
- SetPageDirty(page);
- swp_entry = make_migration_entry(page, pte_write(entry));
- swp_pte = swp_entry_to_pte(swp_entry);
- if (pte_soft_dirty(entry))
- swp_pte = pte_swp_mksoft_dirty(swp_pte);
- set_pte_at(vma->vm_mm, address, pte, swp_pte);
- page_remove_rmap(page, false);
- put_page(page);
+ split_huge_pmd_address(next, nstart, false, NULL);
}
- pte_unmap_unlock(pte - 1, ptl);
}
-static void freeze_page(struct anon_vma *anon_vma, struct page *page)
+static void freeze_page(struct page *page)
{
- struct anon_vma_chain *avc;
- pgoff_t pgoff = page_to_pgoff(page);
+ enum ttu_flags ttu_flags = TTU_MIGRATION | TTU_IGNORE_MLOCK |
+ TTU_IGNORE_ACCESS | TTU_RMAP_LOCKED;
+ int i, ret;
VM_BUG_ON_PAGE(!PageHead(page), page);
- anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff,
- pgoff + HPAGE_PMD_NR - 1) {
- unsigned long address = __vma_address(page, avc->vma);
-
- mmu_notifier_invalidate_range_start(avc->vma->vm_mm,
- address, address + HPAGE_PMD_SIZE);
- freeze_page_vma(avc->vma, page, address);
- mmu_notifier_invalidate_range_end(avc->vma->vm_mm,
- address, address + HPAGE_PMD_SIZE);
- }
-}
-
-static void unfreeze_page_vma(struct vm_area_struct *vma, struct page *page,
- unsigned long address)
-{
- spinlock_t *ptl;
- pmd_t *pmd;
- pte_t *pte, entry;
- swp_entry_t swp_entry;
- unsigned long haddr = address & HPAGE_PMD_MASK;
- int i, nr = HPAGE_PMD_NR;
-
- /* Skip pages which doesn't belong to the VMA */
- if (address < vma->vm_start) {
- int off = (vma->vm_start - address) >> PAGE_SHIFT;
- page += off;
- nr -= off;
- address = vma->vm_start;
- }
-
- pmd = mm_find_pmd(vma->vm_mm, address);
- if (!pmd)
- return;
-
- pte = pte_offset_map_lock(vma->vm_mm, pmd, address, &ptl);
- for (i = 0; i < nr; i++, address += PAGE_SIZE, page++, pte++) {
- /*
- * We've just crossed page table boundary: need to map next one.
- * It can happen if THP was mremaped to non-PMD aligned address.
- */
- if (unlikely(address == haddr + HPAGE_PMD_SIZE)) {
- pte_unmap_unlock(pte - 1, ptl);
- pmd = mm_find_pmd(vma->vm_mm, address);
- if (!pmd)
- return;
- pte = pte_offset_map_lock(vma->vm_mm, pmd,
- address, &ptl);
- }
-
- if (!is_swap_pte(*pte))
- continue;
-
- swp_entry = pte_to_swp_entry(*pte);
- if (!is_migration_entry(swp_entry))
- continue;
- if (migration_entry_to_page(swp_entry) != page)
- continue;
-
- get_page(page);
- page_add_anon_rmap(page, vma, address, false);
-
- entry = pte_mkold(mk_pte(page, vma->vm_page_prot));
- if (PageDirty(page))
- entry = pte_mkdirty(entry);
- if (is_write_migration_entry(swp_entry))
- entry = maybe_mkwrite(entry, vma);
-
- flush_dcache_page(page);
- set_pte_at(vma->vm_mm, address, pte, entry);
+ /* We only need TTU_SPLIT_HUGE_PMD once */
+ ret = try_to_unmap(page, ttu_flags | TTU_SPLIT_HUGE_PMD);
+ for (i = 1; !ret && i < HPAGE_PMD_NR; i++) {
+ /* Cut short if the page is unmapped */
+ if (page_count(page) == 1)
+ return;
- /* No need to invalidate - it was non-present before */
- update_mmu_cache(vma, address, pte);
+ ret = try_to_unmap(page + i, ttu_flags);
}
- pte_unmap_unlock(pte - 1, ptl);
+ VM_BUG_ON(ret);
}
-static void unfreeze_page(struct anon_vma *anon_vma, struct page *page)
+static void unfreeze_page(struct page *page)
{
- struct anon_vma_chain *avc;
- pgoff_t pgoff = page_to_pgoff(page);
-
- anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root,
- pgoff, pgoff + HPAGE_PMD_NR - 1) {
- unsigned long address = __vma_address(page, avc->vma);
+ int i;
- mmu_notifier_invalidate_range_start(avc->vma->vm_mm,
- address, address + HPAGE_PMD_SIZE);
- unfreeze_page_vma(avc->vma, page, address);
- mmu_notifier_invalidate_range_end(avc->vma->vm_mm,
- address, address + HPAGE_PMD_SIZE);
- }
+ for (i = 0; i < HPAGE_PMD_NR; i++)
+ remove_migration_ptes(page + i, page + i, true);
}
static void __split_huge_page_tail(struct page *head, int tail,
struct page *page_tail = head + tail;
VM_BUG_ON_PAGE(atomic_read(&page_tail->_mapcount) != -1, page_tail);
- VM_BUG_ON_PAGE(atomic_read(&page_tail->_count) != 0, page_tail);
+ VM_BUG_ON_PAGE(page_ref_count(page_tail) != 0, page_tail);
/*
* tail_page->_count is zero and not changing from under us. But
* atomic_set() here would be safe on all archs (and not only on x86),
* it's safer to use atomic_inc().
*/
- atomic_inc(&page_tail->_count);
+ page_ref_inc(page_tail);
page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
page_tail->flags |= (head->flags &
ClearPageCompound(head);
spin_unlock_irq(&zone->lru_lock);
- unfreeze_page(page_anon_vma(head), head);
+ unfreeze_page(head);
for (i = 0; i < HPAGE_PMD_NR; i++) {
struct page *subpage = head + i;
}
mlocked = PageMlocked(page);
- freeze_page(anon_vma, head);
+ freeze_page(head);
VM_BUG_ON_PAGE(compound_mapcount(head), head);
/* Make sure the page is not on per-CPU pagevec as it takes pin */
BUG();
} else {
spin_unlock_irqrestore(&pgdata->split_queue_lock, flags);
- unfreeze_page(anon_vma, head);
+ unfreeze_page(head);
ret = -EBUSY;
}