[deliverable/linux.git] / arch / powerpc / mm / hugetlbpage.c

/*
 * PPC64 (POWER4) Huge TLB Page Support for Kernel.
 *
 * Copyright (C) 2003 David Gibson, IBM Corporation.
 *
 * Based on the IA-32 version:
 * Copyright (C) 2002, Rohit Seth <rohit.seth@intel.com>
 */

#include <linux/mm.h>
#include <linux/io.h>
#include <linux/hugetlb.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/tlb.h>

/* log2 of the page sizes named below: 1 << 16 = 64K, 1 << 24 = 16M, 1 << 34 = 16G. */
#define PAGE_SHIFT_64K	16
#define PAGE_SHIFT_16M	24
#define PAGE_SHIFT_16G	34

/* Capacity of gpage_freearray[] below. */
#define MAX_NUMBER_GPAGES	1024

/* Tracks the 16G pages after the device tree is scanned and before the
 * huge_boot_pages list is ready.  add_gpage() appends addresses here and
 * alloc_bootmem_huge_page() pops them off again.  */
static unsigned long gpage_freearray[MAX_NUMBER_GPAGES];
/* Number of valid entries currently stored in gpage_freearray[]. */
static unsigned nr_gpages;

/* Flag to mark huge PD pointers.  This means pmd_bad() and pud_bad()
 * will choke on pointers to hugepte tables, which is handy for
 * catching screwups early. */

/*
 * Translate a page-size shift into an index into mmu_psize_defs[].
 *
 * Indices are scanned from 0 upwards; the first entry whose .shift field
 * equals @shift wins.  Returns that index, or -1 when no entry matches.
 */
static inline int shift_to_mmu_psize(unsigned int shift)
{
	int idx = 0;

	while (idx < MMU_PAGE_COUNT) {
		if (mmu_psize_defs[idx].shift == shift)
			return idx;
		idx++;
	}

	return -1;
}

/*
 * Return the page shift recorded in mmu_psize_defs[] for @mmu_psize.
 *
 * A zero shift is treated as a fatal error and triggers BUG().  @mmu_psize
 * is used as an array index without any range check.
 */
static inline unsigned int mmu_psize_to_shift(unsigned int mmu_psize)
{
	unsigned int shift = mmu_psize_defs[mmu_psize].shift;

	if (!shift)
		BUG();

	return shift;
}

/* True when the hugepd slot is empty, i.e. its pd field is zero. */
#define hugepd_none(hpd)	((hpd).pd == 0)

/*
 * Recover the hugepte table pointer stored in a hugepd entry.
 *
 * The bits covered by HUGEPD_SHIFT_MASK are stripped and the top two
 * address bits (0xc000000000000000) are ORed back in.  BUGs if @hpd does
 * not pass hugepd_ok().
 */
static inline pte_t *hugepd_page(hugepd_t hpd)
{
	unsigned long table;

	BUG_ON(!hugepd_ok(hpd));

	table = hpd.pd & ~HUGEPD_SHIFT_MASK;
	table |= 0xc000000000000000;

	return (pte_t *)table;
}

/*
 * Extract the shift value kept in the HUGEPD_SHIFT_MASK bits of a hugepd
 * entry (the value __hugepte_alloc() ORs into the entry when it is set up).
 */
static inline unsigned int hugepd_shift(hugepd_t hpd)
{
	unsigned int shift = hpd.pd & HUGEPD_SHIFT_MASK;

	return shift;
}

/*
 * Locate the huge PTE for @addr inside the table referenced by *@hpdp.
 *
 * @pdshift is the shift of the directory level holding @hpdp: the low
 * @pdshift bits of @addr give the offset within that directory entry's
 * span, and shifting them right by the hugepd's own shift yields the index
 * into the hugepte table.
 */
static inline pte_t *hugepte_offset(hugepd_t *hpdp, unsigned long addr, unsigned pdshift)
{
	unsigned long span_mask = (1UL << pdshift) - 1;
	unsigned long offset_in_span = addr & span_mask;

	return hugepd_page(*hpdp) + (offset_in_span >> hugepd_shift(*hpdp));
}

/*
 * Walk the page tables rooted at @pgdir looking for the PTE that maps @ea.
 *
 * At each level (PGD, PUD, PMD) the entry may be a hugepd pointer; if so the
 * huge PTE is looked up in the table it references and, when @shift is
 * non-NULL, *@shift receives the hugepd's shift.  If the walk reaches a
 * populated PMD that is not a hugepd, the result of pte_offset_map() is
 * returned and *@shift is left at 0.
 *
 * Returns NULL when an empty entry is met at any level.
 */
pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, unsigned *shift)
{
	hugepd_t *hpdp;
	unsigned pdshift;
	pgd_t *pgd_entry;
	pud_t *pud_entry;
	pmd_t *pmd_entry;

	if (shift)
		*shift = 0;

	/* Top level. */
	pgd_entry = pgdir + pgd_index(ea);
	if (is_hugepd(pgd_entry)) {
		hpdp = (hugepd_t *)pgd_entry;
		pdshift = PGDIR_SHIFT;
		goto found_hugepd;
	}
	if (pgd_none(*pgd_entry))
		return NULL;

	/* Upper level. */
	pud_entry = pud_offset(pgd_entry, ea);
	if (is_hugepd(pud_entry)) {
		hpdp = (hugepd_t *)pud_entry;
		pdshift = PUD_SHIFT;
		goto found_hugepd;
	}
	if (pud_none(*pud_entry))
		return NULL;

	/* Middle level. */
	pmd_entry = pmd_offset(pud_entry, ea);
	if (is_hugepd(pmd_entry)) {
		hpdp = (hugepd_t *)pmd_entry;
		pdshift = PMD_SHIFT;
		goto found_hugepd;
	}
	if (pmd_none(*pmd_entry))
		return NULL;

	/* Not a hugepd: hand back the PTE from the last-level table. */
	return pte_offset_map(pmd_entry, ea);

found_hugepd:
	if (shift)
		*shift = hugepd_shift(*hpdp);
	return hugepte_offset(hpdp, ea, pdshift);
}

pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
{
	return find_linux_pte_or_hugepte(mm->pgd, addr, NULL);
}

/*
 * Allocate a zeroed hugepte table and install it in the hugepd slot *@hpdp.
 *
 * The table comes from the pgtable cache selected by (@pdshift - @pshift).
 * The slot is written under mm->page_table_lock; if it is no longer empty
 * by then, the freshly allocated table is returned to the cache instead.
 * @address is not used.
 *
 * Returns 0 on success (including the "slot already filled" case) and
 * -ENOMEM if the allocation fails.
 */
static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp,
			   unsigned long address, unsigned pdshift, unsigned pshift)
{
	unsigned cache_index = pdshift - pshift;
	pte_t *table = kmem_cache_zalloc(PGT_CACHE(cache_index),
					 GFP_KERNEL|__GFP_REPEAT);

	/* The shift must fit in, and the pointer must not touch, the mask bits. */
	BUG_ON(pshift > HUGEPD_SHIFT_MASK);
	BUG_ON((unsigned long)table & HUGEPD_SHIFT_MASK);

	if (table == NULL)
		return -ENOMEM;

	spin_lock(&mm->page_table_lock);
	if (hugepd_none(*hpdp)) {
		/* Store the pointer with its top bit cleared, plus the shift. */
		hpdp->pd = ((unsigned long)table & ~0x8000000000000000) | pshift;
	} else {
		/* Slot was populated before we got the lock; drop our copy. */
		kmem_cache_free(PGT_CACHE(cache_index), table);
	}
	spin_unlock(&mm->page_table_lock);

	return 0;
}

/*
 * Find, allocating intermediate tables as required, the huge PTE slot that
 * maps @addr for a huge page of @sz bytes in @mm.
 *
 * The page size determines at which directory level the hugepd pointer
 * lives: the PGD for shifts >= PUD_SHIFT, the PUD for shifts >= PMD_SHIFT,
 * otherwise the PMD.  @addr is rounded down to a multiple of @sz first.
 *
 * Returns a pointer to the huge PTE, or NULL if any allocation fails.
 */
pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz)
{
	pgd_t *pg;
	pud_t *pu;
	pmd_t *pm;
	hugepd_t *hpdp = NULL;
	unsigned pshift = __ffs(sz);
	unsigned pdshift = PGDIR_SHIFT;

	addr &= ~(sz-1);

	pg = pgd_offset(mm, addr);
	if (pshift >= PUD_SHIFT) {
		hpdp = (hugepd_t *)pg;
	} else {
		pdshift = PUD_SHIFT;
		pu = pud_alloc(mm, pg, addr);
		/*
		 * Bail out before descending further: handing a NULL pud to
		 * pmd_alloc() below would dereference it.
		 */
		if (!pu)
			return NULL;
		if (pshift >= PMD_SHIFT) {
			hpdp = (hugepd_t *)pu;
		} else {
			pdshift = PMD_SHIFT;
			pm = pmd_alloc(mm, pu, addr);
			hpdp = (hugepd_t *)pm;
		}
	}

	/* Covers a failed pmd_alloc(). */
	if (!hpdp)
		return NULL;

	/* The slot must be either empty or a valid hugepd pointer. */
	BUG_ON(!hugepd_none(*hpdp) && !hugepd_ok(*hpdp));

	if (hugepd_none(*hpdp) && __hugepte_alloc(mm, hpdp, addr, pdshift, pshift))
		return NULL;

	return hugepte_offset(hpdp, addr, pdshift);
}

/* Build list of addresses of gigantic pages.  This function is used in early
 * boot before the buddy or bootmem allocator is setup.
 */
void add_gpage(unsigned long addr, unsigned long page_size,
	unsigned long number_of_pages)
{
	if (!addr)
		return;
	while (number_of_pages > 0) {
		gpage_freearray[nr_gpages] = addr;
		nr_gpages++;
		number_of_pages--;
		addr += page_size;
	}
}

/*
 * Take one gigantic page address off the temporary gpage_freearray[] and
 * queue it on the huge_boot_pages list for @hstate.
 *
 * The most recently recorded address is taken first and its array slot is
 * zeroed.  Returns 1 if a page was moved, 0 if the array was empty.
 */
int alloc_bootmem_huge_page(struct hstate *hstate)
{
	struct huge_bootmem_page *m;
	unsigned long phys;

	if (!nr_gpages)
		return 0;

	/* Pop the last entry. */
	nr_gpages--;
	phys = gpage_freearray[nr_gpages];
	gpage_freearray[nr_gpages] = 0;

	/* The bookkeeping structure is placed in the page itself. */
	m = phys_to_virt(phys);
	list_add(&m->list, &huge_boot_pages);
	m->hstate = hstate;

	return 1;
}

/*
 * Stub: does nothing with its arguments and always returns 0.
 * NOTE(review): presumably this means huge PMD sharing is not implemented
 * here - confirm against the generic hugetlb callers.
 */
int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
{
	return 0;
}

/*
 * Free the hugepte table behind *@hpdp if the directory entry's whole span
 * lies inside the permitted window.
 *
 * @pdshift is the shift of the directory level holding @hpdp.  @start is
 * rounded down and @ceiling (when non-zero) rounded down to that span; the
 * table is kept if the rounded start is below @floor, if the rounded
 * ceiling becomes 0, or if @end lies beyond the ceiling.  A @ceiling of 0
 * means "no upper limit" (0 - 1 wraps to the largest unsigned long).
 *
 * When the table is freed, the hugepd slot is cleared and a TLB flush is
 * requested on @tlb.
 */
static void free_hugepd_range(struct mmu_gather *tlb, hugepd_t *hpdp, int pdshift,
			      unsigned long start, unsigned long end,
			      unsigned long floor, unsigned long ceiling)
{
	pte_t *table = hugepd_page(*hpdp);
	unsigned page_shift = hugepd_shift(*hpdp);
	unsigned long span_mask = ~((1UL << pdshift) - 1);

	if ((start & span_mask) < floor)
		return;

	if (ceiling != 0) {
		ceiling &= span_mask;
		if (ceiling == 0)
			return;
	}

	if (ceiling - 1 < end - 1)
		return;

	hpdp->pd = 0;
	tlb->need_flush = 1;
	pgtable_free_tlb(tlb, table, pdshift - page_shift);
}

/*
 * Free the hugepte tables referenced from the PMD entries that cover
 * [addr, end) below @pud, then free the PMD table itself if the
 * PUD-aligned range fits between @floor and @ceiling.
 *
 * Every non-empty PMD entry in the range is handed to free_hugepd_range()
 * as a hugepd pointer.
 */
static void hugetlb_free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
				   unsigned long addr, unsigned long end,
				   unsigned long floor, unsigned long ceiling)
{
	pmd_t *pmd;
	unsigned long next;
	unsigned long start;

	/* Remember where we began; addr is advanced by the loop below. */
	start = addr;
	pmd = pmd_offset(pud, addr);
	do {
		next = pmd_addr_end(addr, end);
		/* 'continue' jumps to the loop condition, so pmd/addr still advance. */
		if (pmd_none(*pmd))
			continue;
		free_hugepd_range(tlb, (hugepd_t *)pmd, PMD_SHIFT,
				  addr, next, floor, ceiling);
	} while (pmd++, addr = next, addr != end);

	/* Only free the PMD table if its whole PUD span is inside the window. */
	start &= PUD_MASK;
	if (start < floor)
		return;
	if (ceiling) {
		ceiling &= PUD_MASK;
		if (!ceiling)
			return;
	}
	/* With ceiling == 0, ceiling - 1 wraps to the maximum: no upper limit. */
	if (end - 1 > ceiling - 1)
		return;

	/* Fetch the table pointer before the PUD entry is cleared. */
	pmd = pmd_offset(pud, start);
	pud_clear(pud);
	pmd_free_tlb(tlb, pmd, start);
}

/*
 * Free the page-table structures hanging off the PUD entries that cover
 * [addr, end) below @pgd, then free the PUD table itself if the
 * PGDIR-aligned range fits between @floor and @ceiling.
 *
 * A PUD entry that is a hugepd pointer is passed to free_hugepd_range();
 * any other populated entry is descended into via hugetlb_free_pmd_range().
 */
static void hugetlb_free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
				   unsigned long addr, unsigned long end,
				   unsigned long floor, unsigned long ceiling)
{
	pud_t *pud;
	unsigned long next;
	unsigned long start;

	/* Remember where we began; addr is advanced by the loop below. */
	start = addr;
	pud = pud_offset(pgd, addr);
	do {
		next = pud_addr_end(addr, end);
		if (!is_hugepd(pud)) {
			/* 'continue' jumps to the loop condition, so pud/addr still advance. */
			if (pud_none_or_clear_bad(pud))
				continue;
			hugetlb_free_pmd_range(tlb, pud, addr, next, floor,
					       ceiling);
		} else {
			free_hugepd_range(tlb, (hugepd_t *)pud, PUD_SHIFT,
					  addr, next, floor, ceiling);
		}
	} while (pud++, addr = next, addr != end);

	/* Only free the PUD table if its whole PGDIR span is inside the window. */
	start &= PGDIR_MASK;
	if (start < floor)
		return;
	if (ceiling) {
		ceiling &= PGDIR_MASK;
		if (!ceiling)
			return;
	}
	/* With ceiling == 0, ceiling - 1 wraps to the maximum: no upper limit. */
	if (end - 1 > ceiling - 1)
		return;

	/* Fetch the table pointer before the PGD entry is cleared. */
	pud = pud_offset(pgd, start);
	pgd_clear(pgd);
	pud_free_tlb(tlb, pud, start);
}

/*
 * This function frees user-level page tables of a process.
 *
 * Walks the PGD entries of tlb->mm covering [addr, end).  An entry that is
 * a hugepd pointer goes to free_hugepd_range(); any other populated entry
 * is descended into via hugetlb_free_pud_range().  @floor and @ceiling are
 * passed down unchanged and bound what the lower levels may free.
 *
 * Must be called with pagetable lock held.
 */
void hugetlb_free_pgd_range(struct mmu_gather *tlb,
			    unsigned long addr, unsigned long end,
			    unsigned long floor, unsigned long ceiling)
{
	pgd_t *pgd;
	unsigned long next;

	/*
	 * Because there are a number of different possible pagetable
	 * layouts for hugepage ranges, we limit knowledge of how
	 * things should be laid out to the allocation path
	 * (huge_pte_alloc(), above).  Everything else works out the
	 * structure as it goes from information in the hugepd
	 * pointers.  That means that we can't here use the
	 * optimization used in the normal page free_pgd_range(), of
	 * checking whether we're actually covering a large enough
	 * range to have to do anything at the top level of the walk
	 * instead of at the bottom.
	 *
	 * To make sense of this, you should probably go read the big
	 * block comment at the top of the normal free_pgd_range(),
	 * too.
	 */

	pgd = pgd_offset(tlb->mm, addr);
	do {
		next = pgd_addr_end(addr, end);
		if (!is_hugepd(pgd)) {
			/* 'continue' jumps to the loop condition, so pgd/addr still advance. */
			if (pgd_none_or_clear_bad(pgd))
				continue;
			hugetlb_free_pud_range(tlb, pgd, addr, next, floor, ceiling);
		} else {
			free_hugepd_range(tlb, (hugepd_t *)pgd, PGDIR_SHIFT,
					  addr, next, floor, ceiling);
		}
	} while (pgd++, addr = next, addr != end);
}

/*
 * Install @pte at @ptep for the huge page mapping @addr in @mm.
 *
 * The value stored has the _PAGE_HPTEFLAGS bits masked off.  If a present
 * PTE is already there it is cleared first through pte_update().
 */
void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
		     pte_t *ptep, pte_t pte)
{
	unsigned long new_val = pte_val(pte) & ~_PAGE_HPTEFLAGS;

	/*
	 * pte_clear is open-coded here because hpte_need_flush needs the
	 * right huge / !huge argument.  This might become unnecessary if
	 * hpte_need_flush() learns to take the page size from the slices.
	 */
	if (pte_present(*ptep))
		pte_update(mm, addr, ptep, ~0UL, 1);

	*ptep = __pte(new_val);
}

/*
 * Clear every bit of the huge PTE at @ptep through pte_update() and return
 * the value pte_update() reports, wrapped as a pte_t.
 */
pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr,
			      pte_t *ptep)
{
	return __pte(pte_update(mm, addr, ptep, ~0UL, 1));
}

struct page *
follow_huge_addr(struct mm_struct *mm, unsigned long address, int write)
{
	pte_t *ptep;
	struct page *page;
	unsigned shift;
	unsigned long mask;

	ptep = find_linux_pte_or_hugepte(mm->pgd, address, &shift);

	/* Verify it is a huge page else bail. */
	if (!ptep || !shift)
		return ERR_PTR(-EINVAL);

	mask = (1UL << shift) - 1;
	page = pte_page(*ptep);
	if (page)
		page += (address & mask) / PAGE_SIZE;

	return page;
}

/* Always returns 0: no PMD value is reported as a huge mapping here. */
int pmd_huge(pmd_t pmd)
{
	return 0;
}

/* Always returns 0: no PUD value is reported as a huge mapping here. */
int pud_huge(pud_t pud)
{
	return 0;
}

/*
 * Must never be called: hits BUG() unconditionally.  pmd_huge() in this
 * file always returns 0, so presumably no caller should reach this path.
 * The return statement only satisfies the function's return type.
 */
struct page *
follow_huge_pmd(struct mm_struct *mm, unsigned long address,
		pmd_t *pmd, int write)
{
	BUG();
	return NULL;
}

/*
 * Collect page references for the part of [addr, end) that is mapped by the
 * single huge PTE at @ptep (huge page size @sz).
 *
 * @end is clipped to the end of the huge page containing @addr.  Each
 * PAGE_SIZE sub-page in the range is stored in @pages[] starting at index
 * *@nr, and *@nr is advanced accordingly.  One reference per recorded page
 * is then taken on the head page.
 *
 * Returns 1 on success.  Returns 0, with *@nr restored to its value on
 * entry and no references held, if the PTE lacks the required permission
 * bits, the speculative reference cannot be taken, or the PTE changed
 * while the references were being taken.
 */
static noinline int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr,
		       unsigned long end, int write, struct page **pages, int *nr)
{
	unsigned long mask;
	unsigned long pte_end;
	struct page *head, *page;
	pte_t pte;
	int refs;

	/* Do not run past the end of this huge page. */
	pte_end = (addr + sz) & ~(sz-1);
	if (pte_end < end)
		end = pte_end;

	pte = *ptep;
	mask = _PAGE_PRESENT | _PAGE_USER;
	if (write)
		mask |= _PAGE_RW;

	if ((pte_val(pte) & mask) != mask)
		return 0;

	/* hugepages are never "special" */
	VM_BUG_ON(!pfn_valid(pte_pfn(pte)));

	refs = 0;
	head = pte_page(pte);

	page = head + ((addr & (sz-1)) >> PAGE_SHIFT);
	do {
		VM_BUG_ON(compound_head(page) != head);
		pages[*nr] = page;
		(*nr)++;
		page++;
		refs++;
	} while (addr += PAGE_SIZE, addr != end);

	if (!page_cache_add_speculative(head, refs)) {
		*nr -= refs;
		return 0;
	}

	if (unlikely(pte_val(pte) != pte_val(*ptep))) {
		/*
		 * The PTE changed under us.  Undo only what this call did:
		 * forget the entries we recorded and drop the references we
		 * took, which were all taken on the head page.
		 * Could be optimized better.
		 */
		*nr -= refs;
		while (refs--)
			put_page(head);
		return 0;
	}

	return 1;
}

/*
 * Collect page references for [addr, end) from the hugepte table referenced
 * by *@hugepd, one huge PTE at a time via gup_hugepte().
 *
 * @pdshift is the shift of the directory level holding @hugepd.  Returns 1
 * if every huge PTE in the range was handled, 0 as soon as gup_hugepte()
 * fails for one of them.
 */
int gup_hugepd(hugepd_t *hugepd, unsigned pdshift,
	       unsigned long addr, unsigned long end,
	       int write, struct page **pages, int *nr)
{
	pte_t *ptep;
	unsigned long sz = 1UL << hugepd_shift(*hugepd);
	unsigned long next;

	ptep = hugepte_offset(hugepd, addr, pdshift);
	do {
		/*
		 * Step to the next huge page boundary, or to end if that
		 * comes first.  Blindly adding sz would step over end when
		 * addr or end is not huge-page aligned, and the loop
		 * condition would then never become false.  The "- 1" forms
		 * keep the comparison correct if the boundary wraps to 0.
		 */
		next = (addr + sz) & ~(sz - 1);
		if (next - 1 >= end - 1)
			next = end;

		if (!gup_hugepte(ptep, sz, addr, end, write, pages, nr))
			return 0;
	} while (ptep++, addr = next, addr != end);

	return 1;
}

/*
 * Pick an unmapped address range for a hugetlb mapping of @file.
 *
 * The huge page shift of the file's hstate is translated to an MMU page
 * size index and the request is delegated to slice_get_unmapped_area().
 * @pgoff is not used.
 */
unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
					unsigned long len, unsigned long pgoff,
					unsigned long flags)
{
	int mmu_psize = shift_to_mmu_psize(huge_page_shift(hstate_file(file)));

	return slice_get_unmapped_area(addr, len, flags, mmu_psize, 1, 0);
}

/*
 * Return, in bytes, the MMU page size of the slice containing the start of
 * @vma, as reported by get_slice_psize().
 */
unsigned long vma_mmu_pagesize(struct vm_area_struct *vma)
{
	unsigned int shift;

	shift = mmu_psize_to_shift(get_slice_psize(vma->vm_mm, vma->vm_start));

	return 1UL << shift;
}

/*
 * Register @size (in bytes) as a huge page size.
 *
 * Returns 0 if the size was registered or already had an hstate, and
 * -EINVAL if it is not a power of two, falls outside the
 * (PAGE_SHIFT, SLICE_HIGH_SHIFT] shift range, has no matching entry in
 * mmu_psize_defs[], or is 64K while CONFIG_SPU_FS_64K_LS is enabled.
 */
static int __init add_huge_page_size(unsigned long long size)
{
	int shift;
	int mmu_psize;

	/*
	 * Reject non powers of two (including 0) before calling __ffs(),
	 * whose result is undefined for a zero argument.
	 */
	if (!is_power_of_2(size))
		return -EINVAL;

	shift = __ffs(size);

	/* Check that it is a page size supported by the hardware and
	 * that it fits within pagetable and slice limits. */
	if ((shift > SLICE_HIGH_SHIFT) || (shift <= PAGE_SHIFT))
		return -EINVAL;

	if ((mmu_psize = shift_to_mmu_psize(shift)) < 0)
		return -EINVAL;

#ifdef CONFIG_SPU_FS_64K_LS
	/* Disable support for 64K huge pages when 64K SPU local store
	 * support is enabled as the current implementation conflicts.
	 */
	if (shift == PAGE_SHIFT_64K)
		return -EINVAL;
#endif /* CONFIG_SPU_FS_64K_LS */

	BUG_ON(mmu_psize_defs[mmu_psize].shift != shift);

	/* Return if huge page size has already been setup */
	if (size_to_hstate(size))
		return 0;

	hugetlb_add_hstate(shift - PAGE_SHIFT);

	return 0;
}

/*
 * Handler for the "hugepagesz=" boot parameter.
 *
 * Parses the size with memparse() and tries to register it; a warning is
 * printed if add_huge_page_size() rejects it.  Always returns 1.
 */
static int __init hugepage_setup_sz(char *str)
{
	unsigned long long size = memparse(str, &str);
	int err = add_huge_page_size(size);

	if (err)
		printk(KERN_WARNING "Invalid huge page size specified(%llu)\n", size);

	return 1;
}
__setup("hugepagesz=", hugepage_setup_sz);

/*
 * Initcall: register every usable MMU page size as a huge page size and
 * create the pgtable cache its hugepte tables will be allocated from.
 *
 * Returns -ENODEV when the CPU lacks CPU_FTR_16M_PAGE, otherwise 0.
 * Panics if a pgtable cache cannot be created.
 */
static int __init hugetlbpage_init(void)
{
	unsigned int default_shift;
	int idx;

	if (!cpu_has_feature(CPU_FTR_16M_PAGE))
		return -ENODEV;

	for (idx = 0; idx < MMU_PAGE_COUNT; idx++) {
		unsigned shift;
		unsigned pdshift;

		/* Skip page sizes with no shift defined. */
		if (mmu_psize_defs[idx].shift == 0)
			continue;

		shift = mmu_psize_to_shift(idx);

		if (add_huge_page_size(1ULL << shift) < 0)
			continue;

		/* Pick the directory level that will hold the hugepd pointer. */
		pdshift = PGDIR_SHIFT;
		if (shift < PMD_SHIFT)
			pdshift = PMD_SHIFT;
		else if (shift < PUD_SHIFT)
			pdshift = PUD_SHIFT;

		pgtable_cache_add(pdshift - shift, NULL);
		if (!PGT_CACHE(pdshift - shift))
			panic("hugetlbpage_init(): could not create "
			      "pgtable cache for %d bit pagesize\n", shift);
	}

	/*
	 * Choose the default huge page size: 16M if that size is available,
	 * else 1M if that is; otherwise HPAGE_SHIFT is left untouched.
	 */
	default_shift = mmu_psize_defs[MMU_PAGE_16M].shift;
	if (!default_shift)
		default_shift = mmu_psize_defs[MMU_PAGE_1M].shift;
	if (default_shift)
		HPAGE_SHIFT = default_shift;

	return 0;
}

module_init(hugetlbpage_init);
Commit	Line	Data
1da177e4 LT	1	/*
	2	* PPC64 (POWER4) Huge TLB Page Support for Kernel.
	3	*
	4	* Copyright (C) 2003 David Gibson, IBM Corporation.
	5	*
	6	* Based on the IA-32 version:
	7	* Copyright (C) 2002, Rohit Seth <rohit.seth@intel.com>
	8	*/
	9
1da177e4	10	#include <linux/mm.h>
883a3e52	11	#include <linux/io.h>
1da177e4	12	#include <linux/hugetlb.h>
883a3e52	13	#include <asm/pgtable.h>
1da177e4 LT	14	#include <asm/pgalloc.h>
1da177e4 LT	15	#include <asm/tlb.h>
1da177e4	16
91224346 JT	17	#define PAGE_SHIFT_64K 16
	18	#define PAGE_SHIFT_16M 24
	19	#define PAGE_SHIFT_16G 34
4ec161cf	20
ec4b2c0c JT	21	#define MAX_NUMBER_GPAGES 1024
	22
	23	/* Tracks the 16G pages after the device tree is scanned and before the
	24	* huge_boot_pages list is ready. */
	25	static unsigned long gpage_freearray[MAX_NUMBER_GPAGES];
	26	static unsigned nr_gpages;
c594adad	27
f10a04c0 DG	28	/* Flag to mark huge PD pointers. This means pmd_bad() and pud_bad()
	29	* will choke on pointers to hugepte tables, which is handy for
	30	* catching screwups early. */
f10a04c0	31
0d9ea754 JT	32	static inline int shift_to_mmu_psize(unsigned int shift)
0d9ea754 JT	33	{
d1837cba DG	34	int psize;
	35
	36	for (psize = 0; psize < MMU_PAGE_COUNT; ++psize)
	37	if (mmu_psize_defs[psize].shift == shift)
	38	return psize;
0d9ea754 JT	39	return -1;
	40	}
	41
	42	static inline unsigned int mmu_psize_to_shift(unsigned int mmu_psize)
	43	{
	44	if (mmu_psize_defs[mmu_psize].shift)
	45	return mmu_psize_defs[mmu_psize].shift;
	46	BUG();
	47	}
	48
a4fe3ce7 DG	49	#define hugepd_none(hpd) ((hpd).pd == 0)
a4fe3ce7 DG	50
f10a04c0 DG	51	static inline pte_t *hugepd_page(hugepd_t hpd)
f10a04c0 DG	52	{
a4fe3ce7 DG	53	BUG_ON(!hugepd_ok(hpd));
	54	return (pte_t *)((hpd.pd & ~HUGEPD_SHIFT_MASK) \| 0xc000000000000000);
	55	}
	56
	57	static inline unsigned int hugepd_shift(hugepd_t hpd)
	58	{
	59	return hpd.pd & HUGEPD_SHIFT_MASK;
f10a04c0 DG	60	}
f10a04c0 DG	61
a4fe3ce7	62	static inline pte_t hugepte_offset(hugepd_t hpdp, unsigned long addr, unsigned pdshift)
f10a04c0	63	{
a4fe3ce7	64	unsigned long idx = (addr & ((1UL << pdshift) - 1)) >> hugepd_shift(*hpdp);
f10a04c0 DG	65	pte_t dir = hugepd_page(hpdp);
	66
	67	return dir + idx;
	68	}
	69
a4fe3ce7 DG	70	pte_t find_linux_pte_or_hugepte(pgd_t pgdir, unsigned long ea, unsigned *shift)
	71	{
	72	pgd_t *pg;
	73	pud_t *pu;
	74	pmd_t *pm;
	75	hugepd_t *hpdp = NULL;
	76	unsigned pdshift = PGDIR_SHIFT;
	77
	78	if (shift)
	79	*shift = 0;
	80
	81	pg = pgdir + pgd_index(ea);
	82	if (is_hugepd(pg)) {
	83	hpdp = (hugepd_t *)pg;
	84	} else if (!pgd_none(*pg)) {
	85	pdshift = PUD_SHIFT;
	86	pu = pud_offset(pg, ea);
	87	if (is_hugepd(pu))
	88	hpdp = (hugepd_t *)pu;
	89	else if (!pud_none(*pu)) {
	90	pdshift = PMD_SHIFT;
	91	pm = pmd_offset(pu, ea);
	92	if (is_hugepd(pm))
	93	hpdp = (hugepd_t *)pm;
	94	else if (!pmd_none(*pm)) {
	95	return pte_offset_map(pm, ea);
	96	}
	97	}
	98	}
	99
	100	if (!hpdp)
	101	return NULL;
	102
	103	if (shift)
	104	shift = hugepd_shift(hpdp);
	105	return hugepte_offset(hpdp, ea, pdshift);
	106	}
	107
	108	pte_t huge_pte_offset(struct mm_struct mm, unsigned long addr)
	109	{
	110	return find_linux_pte_or_hugepte(mm->pgd, addr, NULL);
	111	}
	112
f10a04c0	113	static int __hugepte_alloc(struct mm_struct mm, hugepd_t hpdp,
a4fe3ce7	114	unsigned long address, unsigned pdshift, unsigned pshift)
f10a04c0	115	{
a4fe3ce7	116	pte_t *new = kmem_cache_zalloc(PGT_CACHE(pdshift - pshift),
a0668cdc	117	GFP_KERNEL\|__GFP_REPEAT);
f10a04c0	118
a4fe3ce7 DG	119	BUG_ON(pshift > HUGEPD_SHIFT_MASK);
	120	BUG_ON((unsigned long)new & HUGEPD_SHIFT_MASK);
	121
f10a04c0 DG	122	if (! new)
	123	return -ENOMEM;
	124
	125	spin_lock(&mm->page_table_lock);
	126	if (!hugepd_none(*hpdp))
a4fe3ce7	127	kmem_cache_free(PGT_CACHE(pdshift - pshift), new);
f10a04c0	128	else
a4fe3ce7	129	hpdp->pd = ((unsigned long)new & ~0x8000000000000000) \| pshift;
f10a04c0 DG	130	spin_unlock(&mm->page_table_lock);
	131	return 0;
	132	}
	133
a4fe3ce7	134	pte_t huge_pte_alloc(struct mm_struct mm, unsigned long addr, unsigned long sz)
0b26425c	135	{
a4fe3ce7 DG	136	pgd_t *pg;
	137	pud_t *pu;
	138	pmd_t *pm;
	139	hugepd_t *hpdp = NULL;
	140	unsigned pshift = __ffs(sz);
	141	unsigned pdshift = PGDIR_SHIFT;
	142
	143	addr &= ~(sz-1);
	144
	145	pg = pgd_offset(mm, addr);
	146	if (pshift >= PUD_SHIFT) {
	147	hpdp = (hugepd_t *)pg;
	148	} else {
	149	pdshift = PUD_SHIFT;
	150	pu = pud_alloc(mm, pg, addr);
	151	if (pshift >= PMD_SHIFT) {
	152	hpdp = (hugepd_t *)pu;
	153	} else {
	154	pdshift = PMD_SHIFT;
	155	pm = pmd_alloc(mm, pu, addr);
	156	hpdp = (hugepd_t *)pm;
	157	}
	158	}
	159
	160	if (!hpdp)
	161	return NULL;
	162
	163	BUG_ON(!hugepd_none(hpdp) && !hugepd_ok(hpdp));
	164
	165	if (hugepd_none(*hpdp) && __hugepte_alloc(mm, hpdp, addr, pdshift, pshift))
	166	return NULL;
	167
	168	return hugepte_offset(hpdp, addr, pdshift);
4ec161cf	169	}
4ec161cf	170
658013e9 JT	171	/* Build list of addresses of gigantic pages. This function is used in early
	172	* boot before the buddy or bootmem allocator is setup.
	173	*/
	174	void add_gpage(unsigned long addr, unsigned long page_size,
	175	unsigned long number_of_pages)
	176	{
	177	if (!addr)
	178	return;
	179	while (number_of_pages > 0) {
	180	gpage_freearray[nr_gpages] = addr;
	181	nr_gpages++;
	182	number_of_pages--;
	183	addr += page_size;
	184	}
	185	}
	186
ec4b2c0c	187	/* Moves the gigantic page addresses from the temporary list to the
0d9ea754 JT	188	* huge_boot_pages list.
	189	*/
	190	int alloc_bootmem_huge_page(struct hstate *hstate)
ec4b2c0c JT	191	{
	192	struct huge_bootmem_page *m;
	193	if (nr_gpages == 0)
	194	return 0;
	195	m = phys_to_virt(gpage_freearray[--nr_gpages]);
	196	gpage_freearray[nr_gpages] = 0;
	197	list_add(&m->list, &huge_boot_pages);
0d9ea754	198	m->hstate = hstate;
ec4b2c0c JT	199	return 1;
	200	}
	201
39dde65c CK	202	int huge_pmd_unshare(struct mm_struct mm, unsigned long addr, pte_t *ptep)
	203	{
	204	return 0;
	205	}
	206
a4fe3ce7 DG	207	static void free_hugepd_range(struct mmu_gather tlb, hugepd_t hpdp, int pdshift,
	208	unsigned long start, unsigned long end,
	209	unsigned long floor, unsigned long ceiling)
f10a04c0 DG	210	{
f10a04c0 DG	211	pte_t hugepte = hugepd_page(hpdp);
a4fe3ce7 DG	212	unsigned shift = hugepd_shift(*hpdp);
	213	unsigned long pdmask = ~((1UL << pdshift) - 1);
	214
	215	start &= pdmask;
	216	if (start < floor)
	217	return;
	218	if (ceiling) {
	219	ceiling &= pdmask;
	220	if (! ceiling)
	221	return;
	222	}
	223	if (end - 1 > ceiling - 1)
	224	return;
f10a04c0 DG	225
	226	hpdp->pd = 0;
	227	tlb->need_flush = 1;
a4fe3ce7	228	pgtable_free_tlb(tlb, hugepte, pdshift - shift);
f10a04c0 DG	229	}
f10a04c0 DG	230
f10a04c0 DG	231	static void hugetlb_free_pmd_range(struct mmu_gather tlb, pud_t pud,
f10a04c0 DG	232	unsigned long addr, unsigned long end,
a4fe3ce7	233	unsigned long floor, unsigned long ceiling)
f10a04c0 DG	234	{
	235	pmd_t *pmd;
	236	unsigned long next;
	237	unsigned long start;
	238
	239	start = addr;
	240	pmd = pmd_offset(pud, addr);
	241	do {
	242	next = pmd_addr_end(addr, end);
	243	if (pmd_none(*pmd))
	244	continue;
a4fe3ce7 DG	245	free_hugepd_range(tlb, (hugepd_t *)pmd, PMD_SHIFT,
a4fe3ce7 DG	246	addr, next, floor, ceiling);
f10a04c0 DG	247	} while (pmd++, addr = next, addr != end);
	248
	249	start &= PUD_MASK;
	250	if (start < floor)
	251	return;
	252	if (ceiling) {
	253	ceiling &= PUD_MASK;
	254	if (!ceiling)
	255	return;
1da177e4	256	}
f10a04c0 DG	257	if (end - 1 > ceiling - 1)
f10a04c0 DG	258	return;
1da177e4	259
f10a04c0 DG	260	pmd = pmd_offset(pud, start);
f10a04c0 DG	261	pud_clear(pud);
9e1b32ca	262	pmd_free_tlb(tlb, pmd, start);
f10a04c0	263	}
f10a04c0 DG	264
	265	static void hugetlb_free_pud_range(struct mmu_gather tlb, pgd_t pgd,
	266	unsigned long addr, unsigned long end,
	267	unsigned long floor, unsigned long ceiling)
	268	{
	269	pud_t *pud;
	270	unsigned long next;
	271	unsigned long start;
	272
	273	start = addr;
	274	pud = pud_offset(pgd, addr);
	275	do {
	276	next = pud_addr_end(addr, end);
a4fe3ce7	277	if (!is_hugepd(pud)) {
4ec161cf JT	278	if (pud_none_or_clear_bad(pud))
4ec161cf JT	279	continue;
0d9ea754	280	hugetlb_free_pmd_range(tlb, pud, addr, next, floor,
a4fe3ce7	281	ceiling);
4ec161cf	282	} else {
a4fe3ce7 DG	283	free_hugepd_range(tlb, (hugepd_t *)pud, PUD_SHIFT,
a4fe3ce7 DG	284	addr, next, floor, ceiling);
4ec161cf	285	}
f10a04c0 DG	286	} while (pud++, addr = next, addr != end);
	287
	288	start &= PGDIR_MASK;
	289	if (start < floor)
	290	return;
	291	if (ceiling) {
	292	ceiling &= PGDIR_MASK;
	293	if (!ceiling)
	294	return;
	295	}
	296	if (end - 1 > ceiling - 1)
	297	return;
	298
	299	pud = pud_offset(pgd, start);
	300	pgd_clear(pgd);
9e1b32ca	301	pud_free_tlb(tlb, pud, start);
f10a04c0 DG	302	}
	303
	304	/*
	305	* This function frees user-level page tables of a process.
	306	*
	307	* Must be called with pagetable lock held.
	308	*/
42b77728	309	void hugetlb_free_pgd_range(struct mmu_gather *tlb,
f10a04c0 DG	310	unsigned long addr, unsigned long end,
	311	unsigned long floor, unsigned long ceiling)
	312	{
	313	pgd_t *pgd;
	314	unsigned long next;
f10a04c0 DG	315
f10a04c0 DG	316	/*
a4fe3ce7 DG	317	* Because there are a number of different possible pagetable
	318	* layouts for hugepage ranges, we limit knowledge of how
	319	* things should be laid out to the allocation path
	320	* (huge_pte_alloc(), above). Everything else works out the
	321	* structure as it goes from information in the hugepd
	322	* pointers. That means that we can't here use the
	323	* optimization used in the normal page free_pgd_range(), of
	324	* checking whether we're actually covering a large enough
	325	* range to have to do anything at the top level of the walk
	326	* instead of at the bottom.
f10a04c0	327	*
a4fe3ce7 DG	328	* To make sense of this, you should probably go read the big
	329	* block comment at the top of the normal free_pgd_range(),
	330	* too.
f10a04c0	331	*/
f10a04c0	332
42b77728	333	pgd = pgd_offset(tlb->mm, addr);
f10a04c0	334	do {
f10a04c0	335	next = pgd_addr_end(addr, end);
a4fe3ce7	336	if (!is_hugepd(pgd)) {
0b26425c DG	337	if (pgd_none_or_clear_bad(pgd))
	338	continue;
	339	hugetlb_free_pud_range(tlb, pgd, addr, next, floor, ceiling);
	340	} else {
a4fe3ce7 DG	341	free_hugepd_range(tlb, (hugepd_t *)pgd, PGDIR_SHIFT,
a4fe3ce7 DG	342	addr, next, floor, ceiling);
0b26425c	343	}
f10a04c0	344	} while (pgd++, addr = next, addr != end);
1da177e4 LT	345	}
1da177e4 LT	346
e28f7faf DG	347	void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
	348	pte_t *ptep, pte_t pte)
	349	{
e28f7faf	350	if (pte_present(*ptep)) {
3c726f8d	351	/* We open-code pte_clear because we need to pass the right
a741e679 BH	352	* argument to hpte_need_flush (huge / !huge). Might not be
	353	* necessary anymore if we make hpte_need_flush() get the
	354	* page size from the slices
3c726f8d	355	*/
f71dc176	356	pte_update(mm, addr, ptep, ~0UL, 1);
e28f7faf	357	}
3c726f8d	358	*ptep = __pte(pte_val(pte) & ~_PAGE_HPTEFLAGS);
1da177e4 LT	359	}
1da177e4 LT	360
e28f7faf DG	361	pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr,
e28f7faf DG	362	pte_t *ptep)
1da177e4	363	{
a741e679	364	unsigned long old = pte_update(mm, addr, ptep, ~0UL, 1);
e28f7faf	365	return __pte(old);
1da177e4 LT	366	}
1da177e4 LT	367
1da177e4 LT	368	struct page *
	369	follow_huge_addr(struct mm_struct *mm, unsigned long address, int write)
	370	{
	371	pte_t *ptep;
	372	struct page *page;
a4fe3ce7 DG	373	unsigned shift;
	374	unsigned long mask;
	375
	376	ptep = find_linux_pte_or_hugepte(mm->pgd, address, &shift);
1da177e4	377
0d9ea754	378	/* Verify it is a huge page else bail. */
a4fe3ce7	379	if (!ptep \|\| !shift)
1da177e4 LT	380	return ERR_PTR(-EINVAL);
1da177e4 LT	381
a4fe3ce7	382	mask = (1UL << shift) - 1;
1da177e4	383	page = pte_page(*ptep);
a4fe3ce7 DG	384	if (page)
a4fe3ce7 DG	385	page += (address & mask) / PAGE_SIZE;
1da177e4 LT	386
	387	return page;
	388	}
	389
	390	int pmd_huge(pmd_t pmd)
	391	{
	392	return 0;
	393	}
	394
ceb86879 AK	395	int pud_huge(pud_t pud)
	396	{
	397	return 0;
	398	}
	399
1da177e4 LT	400	struct page *
	401	follow_huge_pmd(struct mm_struct *mm, unsigned long address,
	402	pmd_t *pmd, int write)
	403	{
	404	BUG();
	405	return NULL;
	406	}
	407
a4fe3ce7 DG	408	static noinline int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr,
	409	unsigned long end, int write, struct page *pages, int nr)
	410	{
	411	unsigned long mask;
	412	unsigned long pte_end;
	413	struct page head, page;
	414	pte_t pte;
	415	int refs;
	416
	417	pte_end = (addr + sz) & ~(sz-1);
	418	if (pte_end < end)
	419	end = pte_end;
	420
	421	pte = *ptep;
	422	mask = _PAGE_PRESENT \| _PAGE_USER;
	423	if (write)
	424	mask \|= _PAGE_RW;
	425
	426	if ((pte_val(pte) & mask) != mask)
	427	return 0;
	428
	429	/* hugepages are never "special" */
	430	VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
	431
	432	refs = 0;
	433	head = pte_page(pte);
	434
	435	page = head + ((addr & (sz-1)) >> PAGE_SHIFT);
	436	do {
	437	VM_BUG_ON(compound_head(page) != head);
	438	pages[*nr] = page;
	439	(*nr)++;
	440	page++;
	441	refs++;
	442	} while (addr += PAGE_SIZE, addr != end);
	443
	444	if (!page_cache_add_speculative(head, refs)) {
	445	*nr -= refs;
	446	return 0;
	447	}
	448
	449	if (unlikely(pte_val(pte) != pte_val(*ptep))) {
	450	/* Could be optimized better */
	451	while (*nr) {
	452	put_page(page);
	453	(*nr)--;
	454	}
	455	}
	456
	457	return 1;
	458	}
	459
	460	int gup_hugepd(hugepd_t *hugepd, unsigned pdshift,
	461	unsigned long addr, unsigned long end,
	462	int write, struct page *pages, int nr)
	463	{
	464	pte_t *ptep;
	465	unsigned long sz = 1UL << hugepd_shift(*hugepd);
	466
	467	ptep = hugepte_offset(hugepd, addr, pdshift);
	468	do {
	469	if (!gup_hugepte(ptep, sz, addr, end, write, pages, nr))
	470	return 0;
	471	} while (ptep++, addr += sz, addr != end);
472
473	return 1;
474	}
1da177e4 LT	475
	476	unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
	477	unsigned long len, unsigned long pgoff,
	478	unsigned long flags)
	479	{
0d9ea754 JT	480	struct hstate *hstate = hstate_file(file);
0d9ea754 JT	481	int mmu_psize = shift_to_mmu_psize(huge_page_shift(hstate));
48f797de	482
0d9ea754	483	return slice_get_unmapped_area(addr, len, flags, mmu_psize, 1, 0);
1da177e4 LT	484	}
1da177e4 LT	485
3340289d MG	486	unsigned long vma_mmu_pagesize(struct vm_area_struct *vma)
	487	{
	488	unsigned int psize = get_slice_psize(vma->vm_mm, vma->vm_start);
	489
	490	return 1UL << mmu_psize_to_shift(psize);
	491	}
	492
d1837cba	493	static int __init add_huge_page_size(unsigned long long size)
4ec161cf	494	{
d1837cba DG	495	int shift = __ffs(size);
d1837cba DG	496	int mmu_psize;
a4fe3ce7	497
4ec161cf	498	/* Check that it is a page size supported by the hardware and
d1837cba DG	499	* that it fits within pagetable and slice limits. */
	500	if (!is_power_of_2(size)
	501	\|\| (shift > SLICE_HIGH_SHIFT) \|\| (shift <= PAGE_SHIFT))
	502	return -EINVAL;
91224346	503
d1837cba DG	504	if ((mmu_psize = shift_to_mmu_psize(shift)) < 0)
	505	return -EINVAL;
	506
	507	#ifdef CONFIG_SPU_FS_64K_LS
	508	/* Disable support for 64K huge pages when 64K SPU local store
	509	* support is enabled as the current implementation conflicts.
	510	*/
	511	if (shift == PAGE_SHIFT_64K)
	512	return -EINVAL;
	513	#endif /* CONFIG_SPU_FS_64K_LS */
	514
	515	BUG_ON(mmu_psize_defs[mmu_psize].shift != shift);
	516
	517	/* Return if huge page size has already been setup */
	518	if (size_to_hstate(size))
	519	return 0;
	520
	521	hugetlb_add_hstate(shift - PAGE_SHIFT);
	522
	523	return 0;
4ec161cf JT	524	}
	525
	526	static int __init hugepage_setup_sz(char *str)
	527	{
	528	unsigned long long size;
4ec161cf JT	529
	530	size = memparse(str, &str);
	531
d1837cba	532	if (add_huge_page_size(size) != 0)
4ec161cf JT	533	printk(KERN_WARNING "Invalid huge page size specified(%llu)\n", size);
	534
	535	return 1;
	536	}
	537	__setup("hugepagesz=", hugepage_setup_sz);
	538
f10a04c0 DG	539	static int __init hugetlbpage_init(void)
f10a04c0 DG	540	{
a4fe3ce7	541	int psize;
0d9ea754	542
f10a04c0 DG	543	if (!cpu_has_feature(CPU_FTR_16M_PAGE))
f10a04c0 DG	544	return -ENODEV;
00df438e	545
d1837cba DG	546	for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) {
	547	unsigned shift;
	548	unsigned pdshift;
0d9ea754	549
d1837cba DG	550	if (!mmu_psize_defs[psize].shift)
d1837cba DG	551	continue;
00df438e	552
d1837cba DG	553	shift = mmu_psize_to_shift(psize);
	554
	555	if (add_huge_page_size(1ULL << shift) < 0)
	556	continue;
	557
	558	if (shift < PMD_SHIFT)
	559	pdshift = PMD_SHIFT;
	560	else if (shift < PUD_SHIFT)
	561	pdshift = PUD_SHIFT;
	562	else
	563	pdshift = PGDIR_SHIFT;
	564
	565	pgtable_cache_add(pdshift - shift, NULL);
	566	if (!PGT_CACHE(pdshift - shift))
	567	panic("hugetlbpage_init(): could not create "
	568	"pgtable cache for %d bit pagesize\n", shift);
0d9ea754	569	}
f10a04c0	570
d1837cba DG	571	/* Set default large page size. Currently, we pick 16M or 1M
	572	* depending on what is available
	573	*/
	574	if (mmu_psize_defs[MMU_PAGE_16M].shift)
	575	HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_16M].shift;
	576	else if (mmu_psize_defs[MMU_PAGE_1M].shift)
	577	HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_1M].shift;
	578
f10a04c0 DG	579	return 0;
	580	}
	581
	582	module_init(hugetlbpage_init);