[deliverable/linux.git] / arch / powerpc / mm / hugetlbpage.c

/*
 * PPC64 (POWER4) Huge TLB Page Support for Kernel.
 *
 * Copyright (C) 2003 David Gibson, IBM Corporation.
 *
 * Based on the IA-32 version:
 * Copyright (C) 2002, Rohit Seth <rohit.seth@intel.com>
 */

#include <linux/init.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/hugetlb.h>
#include <linux/pagemap.h>
#include <linux/slab.h>
#include <linux/err.h>
#include <linux/sysctl.h>
#include <asm/mman.h>
#include <asm/pgalloc.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>
#include <asm/mmu_context.h>
#include <asm/machdep.h>
#include <asm/cputable.h>
#include <asm/spu.h>

/* log2 of the two huge page sizes this file distinguishes (64K and 16M) */
#define HPAGE_SHIFT_64K	16
#define HPAGE_SHIFT_16M	24

/* NOTE(review): neither area count is referenced in the visible part of
 * this file - confirm whether they are still needed. */
#define NUM_LOW_AREAS	(0x100000000UL >> SID_SHIFT)
#define NUM_HIGH_AREAS	(PGTABLE_RANGE >> HTLB_AREA_SHIFT)
/* Capacity of gpage_freearray[] */
#define MAX_NUMBER_GPAGES	1024

/* Tracks the 16G pages after the device tree is scanned and before the
 * huge_boot_pages list is ready.  Entries are physical addresses (they
 * are passed to phys_to_virt() when consumed); nr_gpages counts the
 * entries still waiting to be handed out. */
static unsigned long gpage_freearray[MAX_NUMBER_GPAGES];
static unsigned nr_gpages;

/* log2 of the number of ptes in one hugepte table; chosen by
 * set_huge_psize() from the active huge page size. */
unsigned int hugepte_shift;
#define PTRS_PER_HUGEPTE	(1 << hugepte_shift)
#define HUGEPTE_TABLE_SIZE	(sizeof(pte_t) << hugepte_shift)

/* Address range spanned by one hugepte table: huge page size times the
 * number of entries per table. */
#define HUGEPD_SHIFT		(HPAGE_SHIFT + hugepte_shift)
#define HUGEPD_SIZE		(1UL << HUGEPD_SHIFT)
#define HUGEPD_MASK		(~(HUGEPD_SIZE-1))

/* Slab cache that hugepte tables are allocated from and freed to */
#define huge_pgtable_cache	(pgtable_cache[HUGEPTE_CACHE_NUM])

/* Flag to mark huge PD pointers.  This means pmd_bad() and pud_bad()
 * will choke on pointers to hugepte tables, which is handy for
 * catching screwups early. */
#define HUGEPD_OK	0x1

/* A page-directory slot viewed as a tagged pointer to a hugepte table:
 * the table address with HUGEPD_OK or'ed in, or 0 when empty. */
typedef struct { unsigned long pd; } hugepd_t;

/* True when the directory slot holds no hugepte table */
#define hugepd_none(hpd)	((hpd).pd == 0)

/* Strip the HUGEPD_OK tag from a huge page directory entry and return the
 * hugepte table it points to.  BUGs if the tag is missing. */
static inline pte_t *hugepd_page(hugepd_t hpd)
{
	unsigned long entry = hpd.pd;

	BUG_ON((entry & HUGEPD_OK) == 0);

	return (pte_t *)(entry & ~HUGEPD_OK);
}

/* Return the pte slot for @addr inside the hugepte table that the
 * directory entry @hpdp points to. */
static inline pte_t *hugepte_offset(hugepd_t *hpdp, unsigned long addr)
{
	pte_t *table = hugepd_page(*hpdp);
	unsigned long slot = (addr >> HPAGE_SHIFT) & (PTRS_PER_HUGEPTE - 1);

	return &table[slot];
}

/*
 * Allocate a hugepte table and install it in the directory slot @hpdp.
 *
 * The allocation is done before taking mm->page_table_lock; if the slot
 * turns out to be populated once the lock is held, the fresh table is
 * handed back to the cache and the existing one is left in place.
 *
 * Returns 0 when the slot is populated on exit, -ENOMEM if the
 * allocation failed.  @address is not used.
 */
static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp,
			   unsigned long address)
{
	pte_t *table;

	table = kmem_cache_alloc(huge_pgtable_cache, GFP_KERNEL|__GFP_REPEAT);
	if (!table)
		return -ENOMEM;

	spin_lock(&mm->page_table_lock);
	if (hugepd_none(*hpdp)) {
		/* Slot still empty: publish our table, tagged as a hugepd */
		hpdp->pd = (unsigned long)table | HUGEPD_OK;
	} else {
		/* Slot was filled in the meantime: drop our copy */
		kmem_cache_free(huge_pgtable_cache, table);
	}
	spin_unlock(&mm->page_table_lock);

	return 0;
}

/*
 * The page table level holding the hugepte table pointer depends on the
 * base page size: with CONFIG_PPC_64K_PAGES it is always the pmd level.
 * Otherwise it is the pmd level for 64K huge pages and the pud slot
 * itself (reinterpreted as a pmd_t) for every other huge page size.
 */
#ifdef CONFIG_PPC_64K_PAGES
#define hpmd_offset(pud, addr)		pmd_offset(pud, addr)
#define hpmd_alloc(mm, pud, addr)	pmd_alloc(mm, pud, addr)
#else
/* Locate the slot that holds the hugepte table pointer for @addr */
static inline pmd_t *hpmd_offset(pud_t *pud, unsigned long addr)
{
	return (HPAGE_SHIFT == HPAGE_SHIFT_64K) ? pmd_offset(pud, addr)
						: (pmd_t *) pud;
}

/* As hpmd_offset(), but allocates the pmd table when one is needed */
static inline pmd_t *hpmd_alloc(struct mm_struct *mm, pud_t *pud,
				unsigned long addr)
{
	return (HPAGE_SHIFT == HPAGE_SHIFT_64K) ? pmd_alloc(mm, pud, addr)
						: (pmd_t *) pud;
}
#endif

/*
 * Hand one gigantic page from the temporary gpage_freearray[] over to
 * the huge_boot_pages list, recording @h as its hstate.
 *
 * Returns 1 if a page was moved, 0 if the array is empty.
 */
int alloc_bootmem_huge_page(struct hstate *h)
{
	struct huge_bootmem_page *m;
	unsigned idx;

	if (!nr_gpages)
		return 0;

	/* Take the last recorded page and clear its array slot */
	idx = --nr_gpages;
	m = phys_to_virt(gpage_freearray[idx]);
	gpage_freearray[idx] = 0;

	list_add(&m->list, &huge_boot_pages);
	m->hstate = h;

	return 1;
}


/*
 * Look up the huge pte for @addr without allocating anything.
 * Modelled after find_linux_pte().
 *
 * BUGs if @addr does not lie in a slice using the huge page size.
 * Returns NULL as soon as any level of the walk is unpopulated.
 */
pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
{
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;

	BUG_ON(get_slice_psize(mm, addr) != mmu_huge_psize);

	addr &= HPAGE_MASK;

	pgd = pgd_offset(mm, addr);
	if (pgd_none(*pgd))
		return NULL;

	pud = pud_offset(pgd, addr);
	if (pud_none(*pud))
		return NULL;

	/* This slot holds the hugepd pointer, if one has been installed */
	pmd = hpmd_offset(pud, addr);
	if (pmd_none(*pmd))
		return NULL;

	return hugepte_offset((hugepd_t *)pmd, addr);
}

/*
 * Return the huge pte slot for @addr, allocating any missing page table
 * levels and the hugepte table itself on the way down.
 *
 * BUGs if @addr does not lie in a slice using the huge page size.
 * Returns NULL if any allocation fails.  @sz is not used.
 */
pte_t *huge_pte_alloc(struct mm_struct *mm,
			unsigned long addr, unsigned long sz)
{
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	hugepd_t *hpdp;

	BUG_ON(get_slice_psize(mm, addr) != mmu_huge_psize);

	addr &= HPAGE_MASK;

	pgd = pgd_offset(mm, addr);
	pud = pud_alloc(mm, pgd, addr);
	if (!pud)
		return NULL;

	pmd = hpmd_alloc(mm, pud, addr);
	if (!pmd)
		return NULL;

	/* The slot found above is where the hugepd pointer lives */
	hpdp = (hugepd_t *)pmd;
	if (hugepd_none(*hpdp)) {
		if (__hugepte_alloc(mm, hpdp, addr) != 0)
			return NULL;
	}

	return hugepte_offset(hpdp, addr);
}

/* Stub: ignores all of its arguments and always returns 0.
 * NOTE(review): presumably shared hugetlb page tables are not supported
 * here, so there is never anything to unshare - confirm. */
int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
{
	return 0;
}

/*
 * Detach the hugepte table referenced by @hpdp and queue it on @tlb to
 * be returned to the hugepte cache.  BUGs (via hugepd_page()) if the
 * slot is not a tagged hugepd pointer.
 */
static void free_hugepte_range(struct mmu_gather *tlb, hugepd_t *hpdp)
{
	pte_t *table = hugepd_page(*hpdp);

	/* Mark the gather as needing a flush and empty the directory slot */
	tlb->need_flush = 1;
	hpdp->pd = 0;

	pgtable_free_tlb(tlb,
			 pgtable_free_cache(table, HUGEPTE_CACHE_NUM,
					    PGF_CACHENUM_MASK));
}

/*
 * Free the hugepte tables hanging off the pmd entries that cover
 * [addr, end) under @pud, then free the pmd table itself unless the
 * floor/ceiling tests below say it may still be in use.
 *
 * @floor and @ceiling follow the conventions described in
 * hugetlb_free_pgd_range(): a ceiling of 0 means "top of the address
 * space", hence the "- 1" comparisons.
 */
static void hugetlb_free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
				   unsigned long addr, unsigned long end,
				   unsigned long floor, unsigned long ceiling)
{
	pmd_t *pmd;
	unsigned long next;
	unsigned long start;

	start = addr;
	pmd = pmd_offset(pud, addr);
	do {
		next = pmd_addr_end(addr, end);
		/* Empty slot: nothing to free.  "continue" still runs the
		 * loop condition, so pmd and addr are advanced. */
		if (pmd_none(*pmd))
			continue;
		free_hugepte_range(tlb, (hugepd_t *)pmd);
	} while (pmd++, addr = next, addr != end);

	/* Keep the pmd table if the PUD-aligned region containing the
	 * range starts below floor or extends beyond ceiling. */
	start &= PUD_MASK;
	if (start < floor)
		return;
	if (ceiling) {
		ceiling &= PUD_MASK;
		/* Rounding took ceiling to 0, which would now read as
		 * "top of address space": bail out instead. */
		if (!ceiling)
			return;
	}
	if (end - 1 > ceiling - 1)
		return;

	/* Fetch the pmd table pointer before the pud entry is cleared */
	pmd = pmd_offset(pud, start);
	pud_clear(pud);
	pmd_free_tlb(tlb, pmd);
}

/*
 * Walk the pud entries covering [addr, end) under @pgd and free the
 * hugetlb page tables below them, then free the pud table itself unless
 * the floor/ceiling tests below say it may still be in use.
 *
 * Where the hugepte tables hang depends on the configuration (see
 * hpmd_offset()): either one level further down, handled by
 * hugetlb_free_pmd_range(), or directly off the pud slot.
 */
static void hugetlb_free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
				   unsigned long addr, unsigned long end,
				   unsigned long floor, unsigned long ceiling)
{
	pud_t *pud;
	unsigned long next;
	unsigned long start;

	start = addr;
	pud = pud_offset(pgd, addr);
	do {
		next = pud_addr_end(addr, end);
#ifdef CONFIG_PPC_64K_PAGES
		/* The pud points at a regular pmd table */
		if (pud_none_or_clear_bad(pud))
			continue;
		hugetlb_free_pmd_range(tlb, pud, addr, next, floor, ceiling);
#else
		if (HPAGE_SHIFT == HPAGE_SHIFT_64K) {
			/* 64K huge pages: hugepd pointers are in the pmd */
			if (pud_none_or_clear_bad(pud))
				continue;
			hugetlb_free_pmd_range(tlb, pud, addr, next, floor, ceiling);
		} else {
			/* The pud slot itself is the hugepd pointer, so only
			 * pud_none() is tested here, not the "bad" check. */
			if (pud_none(*pud))
				continue;
			free_hugepte_range(tlb, (hugepd_t *)pud);
		}
#endif
	} while (pud++, addr = next, addr != end);

	/* Keep the pud table if the PGDIR-aligned region containing the
	 * range starts below floor or extends beyond ceiling. */
	start &= PGDIR_MASK;
	if (start < floor)
		return;
	if (ceiling) {
		ceiling &= PGDIR_MASK;
		/* Rounding took ceiling to 0, which would now read as
		 * "top of address space": bail out instead. */
		if (!ceiling)
			return;
	}
	if (end - 1 > ceiling - 1)
		return;

	/* Fetch the pud table pointer before the pgd entry is cleared */
	pud = pud_offset(pgd, start);
	pgd_clear(pgd);
	pud_free_tlb(tlb, pud);
}

/*
 * This function frees user-level page tables of a process.
 *
 * Must be called with pagetable lock held.
 *
 * @tlb:     gather structure; tlb->mm identifies the address space
 * @addr:    start of the range being unmapped
 * @end:     end of the range being unmapped
 * @floor:   lowest address whose page tables may be freed
 * @ceiling: upper limit for freeing page tables; 0 means the top of
 *           the address space
 *
 * BUGs if any address visited is not in a slice using the huge page
 * size.
 */
void hugetlb_free_pgd_range(struct mmu_gather *tlb,
			    unsigned long addr, unsigned long end,
			    unsigned long floor, unsigned long ceiling)
{
	pgd_t *pgd;
	unsigned long next;

	/*
	 * Comments below taken from the normal free_pgd_range().  They
	 * apply here too.  The tests against HUGEPD_MASK below are
	 * essential, because we *don't* test for this at the bottom
	 * level.  Without them we'll attempt to free a hugepte table
	 * when we unmap just part of it, even if there are other
	 * active mappings using it.
	 *
	 * The next few lines have given us lots of grief...
	 *
	 * Why are we testing HUGEPD* at this top level?  Because
	 * often there will be no work to do at all, and we'd prefer
	 * not to go all the way down to the bottom just to discover
	 * that.
	 *
	 * Why all these "- 1"s?  Because 0 represents both the bottom
	 * of the address space and the top of it (using -1 for the
	 * top wouldn't help much: the masks would do the wrong thing).
	 * The rule is that addr 0 and floor 0 refer to the bottom of
	 * the address space, but end 0 and ceiling 0 refer to the top.
	 * Comparisons need to use "end - 1" and "ceiling - 1" (though
	 * that end 0 case should be mythical).
	 *
	 * Wherever addr is brought up or ceiling brought down, we
	 * must be careful to reject "the opposite 0" before it
	 * confuses the subsequent tests.  But what about where end is
	 * brought down by HUGEPD_SIZE below? no, end can't go down to
	 * 0 there.
	 *
	 * Whereas we round start (addr) and ceiling down, by different
	 * masks at different levels, in order to test whether a table
	 * now has no other vmas using it, so can be freed, we don't
	 * bother to round floor or end up - the tests don't need that.
	 */

	/* Shrink [addr, end) to whole hugepte tables that lie entirely
	 * inside [floor, ceiling). */
	addr &= HUGEPD_MASK;
	if (addr < floor) {
		addr += HUGEPD_SIZE;
		if (!addr)
			return;
	}
	if (ceiling) {
		ceiling &= HUGEPD_MASK;
		if (!ceiling)
			return;
	}
	if (end - 1 > ceiling - 1)
		end -= HUGEPD_SIZE;
	if (addr > end - 1)
		return;

	pgd = pgd_offset(tlb->mm, addr);
	do {
		BUG_ON(get_slice_psize(tlb->mm, addr) != mmu_huge_psize);
		next = pgd_addr_end(addr, end);
		/* "continue" still runs the loop condition, so pgd and
		 * addr are advanced past the empty entry. */
		if (pgd_none_or_clear_bad(pgd))
			continue;
		hugetlb_free_pud_range(tlb, pgd, addr, next, floor, ceiling);
	} while (pgd++, addr = next, addr != end);
}

/*
 * Install @pte in the huge pte slot @ptep, with the _PAGE_HPTEFLAGS
 * bits masked out of the stored value.
 */
void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
		     pte_t *ptep, pte_t pte)
{
	unsigned long val = pte_val(pte) & ~_PAGE_HPTEFLAGS;

	/*
	 * pte_clear is open-coded for an already-present entry so that
	 * the right huge / !huge argument reaches hpte_need_flush.  This
	 * might become unnecessary if hpte_need_flush() learns to take
	 * the page size from the slices.
	 */
	if (pte_present(*ptep))
		pte_update(mm, addr & HPAGE_MASK, ptep, ~0UL, 1);

	*ptep = __pte(val);
}

/* Clear every bit of the huge pte at @ptep via pte_update() and return
 * the value it reports as the previous contents. */
pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr,
			      pte_t *ptep)
{
	return __pte(pte_update(mm, addr, ptep, ~0UL, 1));
}

struct page *
follow_huge_addr(struct mm_struct *mm, unsigned long address, int write)
{
	pte_t *ptep;
	struct page *page;

	if (get_slice_psize(mm, address) != mmu_huge_psize)
		return ERR_PTR(-EINVAL);

	ptep = huge_pte_offset(mm, address);
	page = pte_page(*ptep);
	if (page)
		page += (address % HPAGE_SIZE) / PAGE_SIZE;

	return page;
}

/* Always reports that @pmd is not a huge mapping.
 * NOTE(review): presumably because huge pages here are reached through
 * hugepd pointers (see follow_huge_addr()) rather than huge pmd
 * entries - confirm against the generic follow_page() caller. */
int pmd_huge(pmd_t pmd)
{
	return 0;
}

/* Always reports that @pud is not a huge mapping; see pmd_huge(). */
int pud_huge(pud_t pud)
{
	return 0;
}

/* Must never be called: BUG()s unconditionally.  pmd_huge() in this
 * file always returns 0, so callers that check it first never get
 * here.  The return statement only satisfies the signature. */
struct page *
follow_huge_pmd(struct mm_struct *mm, unsigned long address,
		pmd_t *pmd, int write)
{
	BUG();
	return NULL;
}


/*
 * Pick an address range for a hugetlb mapping by delegating to
 * slice_get_unmapped_area() with the huge page size.  @file and @pgoff
 * are not used.
 *
 * NOTE(review): the meaning of the trailing "1, 0" arguments is not
 * visible from this file - check slice_get_unmapped_area().
 */
unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
					unsigned long len, unsigned long pgoff,
					unsigned long flags)
{
	return slice_get_unmapped_area(addr, len, flags,
				       mmu_huge_psize, 1, 0);
}

/*
 * Lazy icache flush helper for hash_huge_page().
 *
 * PG_arch_1 on the first page of the huge page records that the flush
 * has already been done.  When it has not (and the page is neither
 * reserved nor outside valid memory), a fault with @trap == 0x400
 * flushes every base page of the huge page and sets the bit; any other
 * trap gets HPTE_R_N added to @rflags instead.
 *
 * Returns the possibly updated rflags.
 */
static unsigned int hash_huge_page_do_lazy_icache(unsigned long rflags,
						  pte_t pte, int trap)
{
	struct page *page;
	int i;

	if (!pfn_valid(pte_pfn(pte)))
		return rflags;

	page = pte_page(pte);

	/* Already flushed, or a reserved page: nothing to do */
	if (test_bit(PG_arch_1, &page->flags) || PageReserved(page))
		return rflags;

	/* Not the 0x400 trap: defer the flush and add HPTE_R_N */
	if (trap != 0x400)
		return rflags | HPTE_R_N;

	for (i = 0; i < (HPAGE_SIZE / PAGE_SIZE); i++)
		__flush_dcache_icache(page_address(page + i));
	set_bit(PG_arch_1, &page->flags);

	return rflags;
}

/*
 * Handle a hash table miss on a huge page: build or update the hardware
 * HPTE from the Linux huge pte covering @ea.
 *
 * @mm:     address space that faulted
 * @access: pte bits the access requires; the fault is passed up if any
 *          of them is missing from the pte
 * @ea:     faulting effective address
 * @vsid:   virtual segment id, combined with @ea to form the va
 * @local:  passed through to ppc_md.hpte_updatepp()
 * @trap:   trap number, used for the lazy icache flush decision
 *
 * Returns 0 once the HPTE has been inserted or updated, 1 when the
 * fault has to be handled elsewhere (no pte, insufficient rights, or
 * the pte is currently marked _PAGE_BUSY).
 */
int hash_huge_page(struct mm_struct *mm, unsigned long access,
		   unsigned long ea, unsigned long vsid, int local,
		   unsigned long trap)
{
	pte_t *ptep;
	unsigned long old_pte, new_pte;
	unsigned long va, rflags, pa;
	long slot;
	int err = 1;
	int ssize = user_segment_size(ea);

	/* Search the Linux page table for the huge pte covering ea */
	ptep = huge_pte_offset(mm, ea);

	/* Virtual address used to hash into the hardware table */
	va = hpt_va(ea, vsid, ssize);

	/*
	 * If no pte found or not present, send the problem up to
	 * do_page_fault
	 */
	if (unlikely(!ptep || pte_none(*ptep)))
		goto out;

	/*
	 * Check the user's access rights to the page.  If access should be
	 * prevented then send the problem up to do_page_fault.
	 */
	if (unlikely(access & ~pte_val(*ptep)))
		goto out;
	/*
	 * At this point, we have a pte (old_pte) which can be used to build
	 * or update an HPTE. There are 2 cases:
	 *
	 * 1. There is a valid (present) pte with no associated HPTE (this is
	 *	the most common case)
	 * 2. There is a valid (present) pte with an associated HPTE. The
	 *	current values of the pp bits in the HPTE prevent access
	 *	because we are doing software DIRTY bit management and the
	 *	page is currently not DIRTY.
	 */


	/*
	 * Atomically set _PAGE_BUSY (and _PAGE_ACCESSED) in the pte.  If
	 * the pte is already busy, give up with err == 1 rather than
	 * wait; otherwise retry until the cmpxchg succeeds.
	 */
	do {
		old_pte = pte_val(*ptep);
		if (old_pte & _PAGE_BUSY)
			goto out;
		new_pte = old_pte | _PAGE_BUSY | _PAGE_ACCESSED;
	} while(old_pte != __cmpxchg_u64((unsigned long *)ptep,
					 old_pte, new_pte));

	/* Low rflags bits: 0x2 always, plus 0x1 when the pte lacks _PAGE_RW */
	rflags = 0x2 | (!(new_pte & _PAGE_RW));
	/* _PAGE_EXEC -> HW_NO_EXEC since it's inverted */
	rflags |= ((new_pte & _PAGE_EXEC) ? 0 : HPTE_R_N);
	if (!cpu_has_feature(CPU_FTR_COHERENT_ICACHE))
		/* No CPU has hugepages but lacks no execute, so we
		 * don't need to worry about that case */
		rflags = hash_huge_page_do_lazy_icache(rflags, __pte(old_pte),
						       trap);

	/* Check if pte already has an hpte (case 2) */
	if (unlikely(old_pte & _PAGE_HASHPTE)) {
		/* There MIGHT be an HPTE for this pte */
		/* NOTE(review): this "slot" shadows the function-scope
		 * "long slot" declared above. */
		unsigned long hash, slot;

		/* Recompute the slot recorded in the pte: the secondary
		 * hash is the complement, _PAGE_F_GIX the group index. */
		hash = hpt_hash(va, HPAGE_SHIFT, ssize);
		if (old_pte & _PAGE_F_SECOND)
			hash = ~hash;
		slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
		slot += (old_pte & _PAGE_F_GIX) >> 12;

		/* Update failed: forget the stale slot information so the
		 * insert path below runs. */
		if (ppc_md.hpte_updatepp(slot, rflags, va, mmu_huge_psize,
					 ssize, local) == -1)
			old_pte &= ~_PAGE_HPTEFLAGS;
	}

	/* Case 1, or case 2 after a failed update: insert a new HPTE */
	if (likely(!(old_pte & _PAGE_HASHPTE))) {
		unsigned long hash = hpt_hash(va, HPAGE_SHIFT, ssize);
		unsigned long hpte_group;

		pa = pte_pfn(__pte(old_pte)) << PAGE_SHIFT;

repeat:
		hpte_group = ((hash & htab_hash_mask) *
			      HPTES_PER_GROUP) & ~0x7UL;

		/* clear HPTE slot informations in new PTE */
#ifdef CONFIG_PPC_64K_PAGES
		new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | _PAGE_HPTE_SUB0;
#else
		new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | _PAGE_HASHPTE;
#endif
		/* Add in WIMG bits */
		rflags |= (new_pte & (_PAGE_WRITETHRU | _PAGE_NO_CACHE |
				      _PAGE_COHERENT | _PAGE_GUARDED));

		/* Insert into the hash table, primary slot */
		slot = ppc_md.hpte_insert(hpte_group, va, pa, rflags, 0,
					  mmu_huge_psize, ssize);

		/* Primary is full, try the secondary */
		if (unlikely(slot == -1)) {
			hpte_group = ((~hash & htab_hash_mask) *
				      HPTES_PER_GROUP) & ~0x7UL;
			slot = ppc_md.hpte_insert(hpte_group, va, pa, rflags,
						  HPTE_V_SECONDARY,
						  mmu_huge_psize, ssize);
			if (slot == -1) {
				/* Both groups full: evict from one of them
				 * and retry.  hpte_group is the secondary
				 * group here unless the low timebase bit
				 * switches it back to the primary. */
				if (mftb() & 0x1)
					hpte_group = ((hash & htab_hash_mask) *
						      HPTES_PER_GROUP)&~0x7UL;

				ppc_md.hpte_remove(hpte_group);
				goto repeat;
			}
		}

		if (unlikely(slot == -2))
			panic("hash_huge_page: pte_insert failed\n");

		/* Record where the HPTE landed in the pte */
		new_pte |= (slot << 12) & (_PAGE_F_SECOND | _PAGE_F_GIX);
	}

	/*
	 * No need to use ldarx/stdcx here
	 */
	/* Publish the final pte with _PAGE_BUSY cleared */
	*ptep = __pte(new_pte & ~_PAGE_BUSY);

	err = 0;

 out:
	return err;
}

/*
 * Select @psize (an index into mmu_psize_defs[]) as the huge page size.
 *
 * On success HPAGE_SHIFT, mmu_huge_psize and hugepte_shift are updated.
 * If the size is not usable, HPAGE_SHIFT is set to 0 and the other two
 * are left untouched.
 */
void set_huge_psize(int psize)
{
	/* Not supported by the hardware, or does not fit in a segment */
	if (!mmu_psize_defs[psize].shift ||
	    mmu_psize_defs[psize].shift >= SID_SHIFT) {
		HPAGE_SHIFT = 0;
		return;
	}

	/* Too small for the pagetable layout, unless it is the 64K case */
	if (mmu_psize_defs[psize].shift <= MIN_HUGEPTE_SHIFT &&
	    mmu_psize_defs[psize].shift != HPAGE_SHIFT_64K) {
		HPAGE_SHIFT = 0;
		return;
	}

	HPAGE_SHIFT = mmu_psize_defs[psize].shift;
	mmu_huge_psize = psize;

	/* A hugepte table covers the range of the slot it hangs off:
	 * see hpmd_offset() for which level that is. */
#ifdef CONFIG_PPC_64K_PAGES
	hugepte_shift = (PMD_SHIFT-HPAGE_SHIFT);
#else
	hugepte_shift = (HPAGE_SHIFT == HPAGE_SHIFT_64K)
				? (PMD_SHIFT-HPAGE_SHIFT)
				: (PUD_SHIFT-HPAGE_SHIFT);
#endif
}

/*
 * Parse the "hugepagesz=" boot parameter and select the matching huge
 * page size.
 *
 * Accepted values are exactly 16M and, when not built with
 * CONFIG_PPC_64K_PAGES, exactly 64K; the size must also be one the MMU
 * reports as supported (non-zero shift in mmu_psize_defs[]).  Anything
 * else logs a warning and leaves the current setting alone.
 *
 * Always returns 1.
 */
static int __init hugepage_setup_sz(char *str)
{
	unsigned long long size;
	int mmu_psize = -1;

	size = memparse(str, &str);

	/*
	 * Only a non-zero power of two can name a page size.  Checking
	 * this first keeps __ffs() away from 0, for which its result is
	 * undefined, and stops a value such as 48M from being accepted
	 * as 16M just because its lowest set bit is bit 24.
	 */
	if (size && !(size & (size - 1))) {
		switch (__ffs(size)) {
#ifndef CONFIG_PPC_64K_PAGES
		case HPAGE_SHIFT_64K:
			mmu_psize = MMU_PAGE_64K;
			break;
#endif
		case HPAGE_SHIFT_16M:
			mmu_psize = MMU_PAGE_16M;
			break;
		default:
			break;
		}
	}

	if (mmu_psize >= 0 && mmu_psize_defs[mmu_psize].shift)
		set_huge_psize(mmu_psize);
	else
		printk(KERN_WARNING "Invalid huge page size specified(%llu)\n", size);

	return 1;
}
__setup("hugepagesz=", hugepage_setup_sz);

/* Slab constructor: zero the whole object at @addr, using the object
 * size reported by kmem_cache_size() for @cache. */
static void zero_ctor(struct kmem_cache *cache, void *addr)
{
	memset(addr, 0, kmem_cache_size(cache));
}

/*
 * Create the slab cache that hugepte tables are allocated from.
 *
 * Returns -ENODEV if the CPU lacks CPU_FTR_16M_PAGE, 0 on success.
 * Panics if the cache cannot be created.
 */
static int __init hugetlbpage_init(void)
{
	int have_16m = cpu_has_feature(CPU_FTR_16M_PAGE);

	if (!have_16m)
		return -ENODEV;

	/* HUGEPTE_TABLE_SIZE is passed twice (second and third argument);
	 * objects are zeroed by zero_ctor. */
	huge_pgtable_cache = kmem_cache_create("hugepte_cache",
					       HUGEPTE_TABLE_SIZE,
					       HUGEPTE_TABLE_SIZE,
					       0, zero_ctor);
	if (!huge_pgtable_cache)
		panic("hugetlbpage_init(): could not create hugepte cache\n");

	return 0;
}

module_init(hugetlbpage_init);
Commit	Line	Data
1da177e4 LT	1	/*
	2	* PPC64 (POWER4) Huge TLB Page Support for Kernel.
	3	*
	4	* Copyright (C) 2003 David Gibson, IBM Corporation.
	5	*
	6	* Based on the IA-32 version:
	7	* Copyright (C) 2002, Rohit Seth <rohit.seth@intel.com>
	8	*/
	9
	10	#include <linux/init.h>
	11	#include <linux/fs.h>
	12	#include <linux/mm.h>
	13	#include <linux/hugetlb.h>
	14	#include <linux/pagemap.h>
1da177e4 LT	15	#include <linux/slab.h>
	16	#include <linux/err.h>
	17	#include <linux/sysctl.h>
	18	#include <asm/mman.h>
	19	#include <asm/pgalloc.h>
	20	#include <asm/tlb.h>
	21	#include <asm/tlbflush.h>
	22	#include <asm/mmu_context.h>
	23	#include <asm/machdep.h>
	24	#include <asm/cputable.h>
94b2a439	25	#include <asm/spu.h>
1da177e4	26
4ec161cf JT	27	#define HPAGE_SHIFT_64K 16
	28	#define HPAGE_SHIFT_16M 24
	29
c594adad DG	30	#define NUM_LOW_AREAS (0x100000000UL >> SID_SHIFT)
c594adad DG	31	#define NUM_HIGH_AREAS (PGTABLE_RANGE >> HTLB_AREA_SHIFT)
ec4b2c0c JT	32	#define MAX_NUMBER_GPAGES 1024
	33
	34	/* Tracks the 16G pages after the device tree is scanned and before the
	35	* huge_boot_pages list is ready. */
	36	static unsigned long gpage_freearray[MAX_NUMBER_GPAGES];
	37	static unsigned nr_gpages;
c594adad	38
4ec161cf JT	39	unsigned int hugepte_shift;
	40	#define PTRS_PER_HUGEPTE (1 << hugepte_shift)
	41	#define HUGEPTE_TABLE_SIZE (sizeof(pte_t) << hugepte_shift)
f10a04c0	42
4ec161cf	43	#define HUGEPD_SHIFT (HPAGE_SHIFT + hugepte_shift)
f10a04c0 DG	44	#define HUGEPD_SIZE (1UL << HUGEPD_SHIFT)
	45	#define HUGEPD_MASK (~(HUGEPD_SIZE-1))
	46
	47	#define huge_pgtable_cache (pgtable_cache[HUGEPTE_CACHE_NUM])
	48
	49	/* Flag to mark huge PD pointers. This means pmd_bad() and pud_bad()
	50	* will choke on pointers to hugepte tables, which is handy for
	51	* catching screwups early. */
	52	#define HUGEPD_OK 0x1
	53
	54	typedef struct { unsigned long pd; } hugepd_t;
	55
	56	#define hugepd_none(hpd) ((hpd).pd == 0)
	57
	58	static inline pte_t *hugepd_page(hugepd_t hpd)
	59	{
	60	BUG_ON(!(hpd.pd & HUGEPD_OK));
	61	return (pte_t *)(hpd.pd & ~HUGEPD_OK);
	62	}
	63
	64	static inline pte_t hugepte_offset(hugepd_t hpdp, unsigned long addr)
	65	{
	66	unsigned long idx = ((addr >> HPAGE_SHIFT) & (PTRS_PER_HUGEPTE-1));
	67	pte_t dir = hugepd_page(hpdp);
	68
	69	return dir + idx;
	70	}
	71
	72	static int __hugepte_alloc(struct mm_struct mm, hugepd_t hpdp,
	73	unsigned long address)
	74	{
	75	pte_t *new = kmem_cache_alloc(huge_pgtable_cache,
	76	GFP_KERNEL\|__GFP_REPEAT);
	77
	78	if (! new)
	79	return -ENOMEM;
	80
	81	spin_lock(&mm->page_table_lock);
	82	if (!hugepd_none(*hpdp))
	83	kmem_cache_free(huge_pgtable_cache, new);
	84	else
	85	hpdp->pd = (unsigned long)new \| HUGEPD_OK;
	86	spin_unlock(&mm->page_table_lock);
	87	return 0;
	88	}
	89
4ec161cf JT	90	/* Base page size affects how we walk hugetlb page tables */
	91	#ifdef CONFIG_PPC_64K_PAGES
	92	#define hpmd_offset(pud, addr) pmd_offset(pud, addr)
	93	#define hpmd_alloc(mm, pud, addr) pmd_alloc(mm, pud, addr)
	94	#else
	95	static inline
	96	pmd_t hpmd_offset(pud_t pud, unsigned long addr)
	97	{
	98	if (HPAGE_SHIFT == HPAGE_SHIFT_64K)
	99	return pmd_offset(pud, addr);
	100	else
	101	return (pmd_t *) pud;
	102	}
	103	static inline
	104	pmd_t hpmd_alloc(struct mm_struct mm, pud_t *pud, unsigned long addr)
	105	{
	106	if (HPAGE_SHIFT == HPAGE_SHIFT_64K)
	107	return pmd_alloc(mm, pud, addr);
	108	else
	109	return (pmd_t *) pud;
	110	}
	111	#endif
	112
ec4b2c0c JT	113	/* Moves the gigantic page addresses from the temporary list to the
	114	* huge_boot_pages list. */
	115	int alloc_bootmem_huge_page(struct hstate *h)
	116	{
	117	struct huge_bootmem_page *m;
	118	if (nr_gpages == 0)
	119	return 0;
	120	m = phys_to_virt(gpage_freearray[--nr_gpages]);
	121	gpage_freearray[nr_gpages] = 0;
	122	list_add(&m->list, &huge_boot_pages);
	123	m->hstate = h;
	124	return 1;
	125	}
	126
	127
e28f7faf DG	128	/* Modelled after find_linux_pte() */
e28f7faf DG	129	pte_t huge_pte_offset(struct mm_struct mm, unsigned long addr)
1da177e4	130	{
e28f7faf DG	131	pgd_t *pg;
e28f7faf DG	132	pud_t *pu;
4ec161cf	133	pmd_t *pm;
1da177e4	134
d0f13e3c	135	BUG_ON(get_slice_psize(mm, addr) != mmu_huge_psize);
1da177e4	136
e28f7faf DG	137	addr &= HPAGE_MASK;
	138
	139	pg = pgd_offset(mm, addr);
	140	if (!pgd_none(*pg)) {
	141	pu = pud_offset(pg, addr);
	142	if (!pud_none(*pu)) {
4ec161cf	143	pm = hpmd_offset(pu, addr);
f10a04c0 DG	144	if (!pmd_none(*pm))
f10a04c0 DG	145	return hugepte_offset((hugepd_t *)pm, addr);
e28f7faf DG	146	}
e28f7faf DG	147	}
1da177e4	148
e28f7faf	149	return NULL;
1da177e4 LT	150	}
1da177e4 LT	151
a5516438 AK	152	pte_t huge_pte_alloc(struct mm_struct mm,
a5516438 AK	153	unsigned long addr, unsigned long sz)
1da177e4	154	{
e28f7faf DG	155	pgd_t *pg;
e28f7faf DG	156	pud_t *pu;
4ec161cf	157	pmd_t *pm;
f10a04c0	158	hugepd_t *hpdp = NULL;
1da177e4	159
d0f13e3c	160	BUG_ON(get_slice_psize(mm, addr) != mmu_huge_psize);
1da177e4	161
e28f7faf	162	addr &= HPAGE_MASK;
1da177e4	163
e28f7faf DG	164	pg = pgd_offset(mm, addr);
e28f7faf DG	165	pu = pud_alloc(mm, pg, addr);
1da177e4	166
e28f7faf	167	if (pu) {
4ec161cf	168	pm = hpmd_alloc(mm, pu, addr);
f10a04c0 DG	169	if (pm)
f10a04c0 DG	170	hpdp = (hugepd_t *)pm;
f10a04c0 DG	171	}
	172
	173	if (! hpdp)
	174	return NULL;
	175
	176	if (hugepd_none(*hpdp) && __hugepte_alloc(mm, hpdp, addr))
	177	return NULL;
	178
	179	return hugepte_offset(hpdp, addr);
	180	}
	181
39dde65c CK	182	int huge_pmd_unshare(struct mm_struct mm, unsigned long addr, pte_t *ptep)
	183	{
	184	return 0;
	185	}
	186
f10a04c0 DG	187	static void free_hugepte_range(struct mmu_gather tlb, hugepd_t hpdp)
	188	{
	189	pte_t hugepte = hugepd_page(hpdp);
	190
	191	hpdp->pd = 0;
	192	tlb->need_flush = 1;
	193	pgtable_free_tlb(tlb, pgtable_free_cache(hugepte, HUGEPTE_CACHE_NUM,
c9169f87	194	PGF_CACHENUM_MASK));
f10a04c0 DG	195	}
f10a04c0 DG	196
f10a04c0 DG	197	static void hugetlb_free_pmd_range(struct mmu_gather tlb, pud_t pud,
	198	unsigned long addr, unsigned long end,
	199	unsigned long floor, unsigned long ceiling)
	200	{
	201	pmd_t *pmd;
	202	unsigned long next;
	203	unsigned long start;
	204
	205	start = addr;
	206	pmd = pmd_offset(pud, addr);
	207	do {
	208	next = pmd_addr_end(addr, end);
	209	if (pmd_none(*pmd))
	210	continue;
	211	free_hugepte_range(tlb, (hugepd_t *)pmd);
	212	} while (pmd++, addr = next, addr != end);
	213
	214	start &= PUD_MASK;
	215	if (start < floor)
	216	return;
	217	if (ceiling) {
	218	ceiling &= PUD_MASK;
	219	if (!ceiling)
	220	return;
1da177e4	221	}
f10a04c0 DG	222	if (end - 1 > ceiling - 1)
f10a04c0 DG	223	return;
1da177e4	224
f10a04c0 DG	225	pmd = pmd_offset(pud, start);
	226	pud_clear(pud);
	227	pmd_free_tlb(tlb, pmd);
	228	}
f10a04c0 DG	229
	230	static void hugetlb_free_pud_range(struct mmu_gather tlb, pgd_t pgd,
	231	unsigned long addr, unsigned long end,
	232	unsigned long floor, unsigned long ceiling)
	233	{
	234	pud_t *pud;
	235	unsigned long next;
	236	unsigned long start;
	237
	238	start = addr;
	239	pud = pud_offset(pgd, addr);
	240	do {
	241	next = pud_addr_end(addr, end);
	242	#ifdef CONFIG_PPC_64K_PAGES
	243	if (pud_none_or_clear_bad(pud))
	244	continue;
	245	hugetlb_free_pmd_range(tlb, pud, addr, next, floor, ceiling);
	246	#else
4ec161cf JT	247	if (HPAGE_SHIFT == HPAGE_SHIFT_64K) {
	248	if (pud_none_or_clear_bad(pud))
	249	continue;
	250	hugetlb_free_pmd_range(tlb, pud, addr, next, floor, ceiling);
	251	} else {
	252	if (pud_none(*pud))
	253	continue;
	254	free_hugepte_range(tlb, (hugepd_t *)pud);
	255	}
f10a04c0 DG	256	#endif
	257	} while (pud++, addr = next, addr != end);
	258
	259	start &= PGDIR_MASK;
	260	if (start < floor)
	261	return;
	262	if (ceiling) {
	263	ceiling &= PGDIR_MASK;
	264	if (!ceiling)
	265	return;
	266	}
	267	if (end - 1 > ceiling - 1)
	268	return;
	269
	270	pud = pud_offset(pgd, start);
	271	pgd_clear(pgd);
	272	pud_free_tlb(tlb, pud);
	273	}
	274
	275	/*
	276	* This function frees user-level page tables of a process.
	277	*
	278	* Must be called with pagetable lock held.
	279	*/
42b77728	280	void hugetlb_free_pgd_range(struct mmu_gather *tlb,
f10a04c0 DG	281	unsigned long addr, unsigned long end,
	282	unsigned long floor, unsigned long ceiling)
	283	{
	284	pgd_t *pgd;
	285	unsigned long next;
	286	unsigned long start;
	287
	288	/*
	289	* Comments below take from the normal free_pgd_range(). They
	290	* apply here too. The tests against HUGEPD_MASK below are
	291	* essential, because we don't test for this at the bottom
	292	* level. Without them we'll attempt to free a hugepte table
	293	* when we unmap just part of it, even if there are other
	294	* active mappings using it.
	295	*
	296	* The next few lines have given us lots of grief...
	297	*
	298	* Why are we testing HUGEPD* at this top level? Because
	299	* often there will be no work to do at all, and we'd prefer
	300	* not to go all the way down to the bottom just to discover
	301	* that.
	302	*
	303	* Why all these "- 1"s? Because 0 represents both the bottom
	304	* of the address space and the top of it (using -1 for the
	305	* top wouldn't help much: the masks would do the wrong thing).
	306	* The rule is that addr 0 and floor 0 refer to the bottom of
	307	* the address space, but end 0 and ceiling 0 refer to the top
	308	* Comparisons need to use "end - 1" and "ceiling - 1" (though
	309	* that end 0 case should be mythical).
	310	*
	311	* Wherever addr is brought up or ceiling brought down, we
	312	* must be careful to reject "the opposite 0" before it
	313	* confuses the subsequent tests. But what about where end is
	314	* brought down by HUGEPD_SIZE below? no, end can't go down to
	315	* 0 there.
	316	*
	317	* Whereas we round start (addr) and ceiling down, by different
	318	* masks at different levels, in order to test whether a table
	319	* now has no other vmas using it, so can be freed, we don't
	320	* bother to round floor or end up - the tests don't need that.
	321	*/
	322
	323	addr &= HUGEPD_MASK;
	324	if (addr < floor) {
	325	addr += HUGEPD_SIZE;
	326	if (!addr)
	327	return;
	328	}
	329	if (ceiling) {
	330	ceiling &= HUGEPD_MASK;
	331	if (!ceiling)
	332	return;
	333	}
	334	if (end - 1 > ceiling - 1)
	335	end -= HUGEPD_SIZE;
	336	if (addr > end - 1)
	337	return;
	338
	339	start = addr;
42b77728	340	pgd = pgd_offset(tlb->mm, addr);
f10a04c0	341	do {
42b77728	342	BUG_ON(get_slice_psize(tlb->mm, addr) != mmu_huge_psize);
f10a04c0 DG	343	next = pgd_addr_end(addr, end);
	344	if (pgd_none_or_clear_bad(pgd))
	345	continue;
42b77728	346	hugetlb_free_pud_range(tlb, pgd, addr, next, floor, ceiling);
f10a04c0	347	} while (pgd++, addr = next, addr != end);
1da177e4 LT	348	}
1da177e4 LT	349
e28f7faf DG	350	void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
	351	pte_t *ptep, pte_t pte)
	352	{
e28f7faf	353	if (pte_present(*ptep)) {
3c726f8d	354	/* We open-code pte_clear because we need to pass the right
a741e679 BH	355	* argument to hpte_need_flush (huge / !huge). Might not be
	356	* necessary anymore if we make hpte_need_flush() get the
	357	* page size from the slices
3c726f8d	358	*/
a741e679	359	pte_update(mm, addr & HPAGE_MASK, ptep, ~0UL, 1);
e28f7faf	360	}
3c726f8d	361	*ptep = __pte(pte_val(pte) & ~_PAGE_HPTEFLAGS);
1da177e4 LT	362	}
1da177e4 LT	363
e28f7faf DG	364	pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr,
e28f7faf DG	365	pte_t *ptep)
1da177e4	366	{
a741e679	367	unsigned long old = pte_update(mm, addr, ptep, ~0UL, 1);
e28f7faf	368	return __pte(old);
1da177e4 LT	369	}
1da177e4 LT	370
1da177e4 LT	371	struct page *
	372	follow_huge_addr(struct mm_struct *mm, unsigned long address, int write)
	373	{
	374	pte_t *ptep;
	375	struct page *page;
	376
d0f13e3c	377	if (get_slice_psize(mm, address) != mmu_huge_psize)
1da177e4 LT	378	return ERR_PTR(-EINVAL);
	379
	380	ptep = huge_pte_offset(mm, address);
	381	page = pte_page(*ptep);
	382	if (page)
	383	page += (address % HPAGE_SIZE) / PAGE_SIZE;
	384
	385	return page;
	386	}
	387
	388	int pmd_huge(pmd_t pmd)
	389	{
	390	return 0;
	391	}
	392
ceb86879 AK	393	int pud_huge(pud_t pud)
	394	{
	395	return 0;
	396	}
	397
1da177e4 LT	398	struct page *
	399	follow_huge_pmd(struct mm_struct *mm, unsigned long address,
	400	pmd_t *pmd, int write)
	401	{
	402	BUG();
	403	return NULL;
	404	}
	405
1da177e4 LT	406
	407	unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
	408	unsigned long len, unsigned long pgoff,
	409	unsigned long flags)
	410	{
d0f13e3c BH	411	return slice_get_unmapped_area(addr, len, flags,
d0f13e3c BH	412	mmu_huge_psize, 1, 0);
1da177e4 LT	413	}
1da177e4 LT	414
cbf52afd DG	415	/*
	416	* Called by asm hashtable.S for doing lazy icache flush
	417	*/
	418	static unsigned int hash_huge_page_do_lazy_icache(unsigned long rflags,
	419	pte_t pte, int trap)
	420	{
	421	struct page *page;
	422	int i;
	423
	424	if (!pfn_valid(pte_pfn(pte)))
	425	return rflags;
	426
	427	page = pte_page(pte);
	428
	429	/* page is dirty */
	430	if (!test_bit(PG_arch_1, &page->flags) && !PageReserved(page)) {
	431	if (trap == 0x400) {
	432	for (i = 0; i < (HPAGE_SIZE / PAGE_SIZE); i++)
	433	__flush_dcache_icache(page_address(page+i));
	434	set_bit(PG_arch_1, &page->flags);
	435	} else {
	436	rflags \|= HPTE_R_N;
	437	}
	438	}
	439	return rflags;
	440	}
	441
1da177e4	442	int hash_huge_page(struct mm_struct *mm, unsigned long access,
cbf52afd DG	443	unsigned long ea, unsigned long vsid, int local,
cbf52afd DG	444	unsigned long trap)
1da177e4 LT	445	{
1da177e4 LT	446	pte_t *ptep;
3c726f8d BH	447	unsigned long old_pte, new_pte;
3c726f8d BH	448	unsigned long va, rflags, pa;
1da177e4 LT	449	long slot;
1da177e4 LT	450	int err = 1;
1189be65	451	int ssize = user_segment_size(ea);
1da177e4	452
1da177e4 LT	453	ptep = huge_pte_offset(mm, ea);
	454
	455	/* Search the Linux page table for a match with va */
1189be65	456	va = hpt_va(ea, vsid, ssize);
1da177e4 LT	457
	458	/*
	459	* If no pte found or not present, send the problem up to
	460	* do_page_fault
	461	*/
	462	if (unlikely(!ptep \|\| pte_none(*ptep)))
	463	goto out;
	464
1da177e4 LT	465	/*
	466	* Check the user's access rights to the page. If access should be
	467	* prevented then send the problem up to do_page_fault.
	468	*/
	469	if (unlikely(access & ~pte_val(*ptep)))
	470	goto out;
	471	/*
	472	* At this point, we have a pte (old_pte) which can be used to build
	473	* or update an HPTE. There are 2 cases:
	474	*
	475	* 1. There is a valid (present) pte with no associated HPTE (this is
	476	* the most common case)
	477	* 2. There is a valid (present) pte with an associated HPTE. The
	478	* current values of the pp bits in the HPTE prevent access
	479	* because we are doing software DIRTY bit management and the
	480	* page is currently not DIRTY.
	481	*/
	482
	483
3c726f8d BH	484	do {
	485	old_pte = pte_val(*ptep);
	486	if (old_pte & _PAGE_BUSY)
	487	goto out;
41743a4e	488	new_pte = old_pte \| _PAGE_BUSY \| _PAGE_ACCESSED;
3c726f8d BH	489	} while(old_pte != __cmpxchg_u64((unsigned long *)ptep,
	490	old_pte, new_pte));
	491
	492	rflags = 0x2 \| (!(new_pte & _PAGE_RW));
1da177e4	493	/* _PAGE_EXEC -> HW_NO_EXEC since it's inverted */
3c726f8d	494	rflags \|= ((new_pte & _PAGE_EXEC) ? 0 : HPTE_R_N);
cbf52afd DG	495	if (!cpu_has_feature(CPU_FTR_COHERENT_ICACHE))
	496	/* No CPU has hugepages but lacks no execute, so we
	497	* don't need to worry about that case */
	498	rflags = hash_huge_page_do_lazy_icache(rflags, __pte(old_pte),
	499	trap);
1da177e4 LT	500
1da177e4 LT	501	/* Check if pte already has an hpte (case 2) */
3c726f8d	502	if (unlikely(old_pte & _PAGE_HASHPTE)) {
1da177e4 LT	503	/* There MIGHT be an HPTE for this pte */
	504	unsigned long hash, slot;
	505
1189be65	506	hash = hpt_hash(va, HPAGE_SHIFT, ssize);
3c726f8d	507	if (old_pte & _PAGE_F_SECOND)
1da177e4 LT	508	hash = ~hash;
1da177e4 LT	509	slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
3c726f8d	510	slot += (old_pte & _PAGE_F_GIX) >> 12;
1da177e4	511
325c82a0	512	if (ppc_md.hpte_updatepp(slot, rflags, va, mmu_huge_psize,
1189be65	513	ssize, local) == -1)
3c726f8d	514	old_pte &= ~_PAGE_HPTEFLAGS;
1da177e4 LT	515	}
1da177e4 LT	516
3c726f8d	517	if (likely(!(old_pte & _PAGE_HASHPTE))) {
1189be65	518	unsigned long hash = hpt_hash(va, HPAGE_SHIFT, ssize);
1da177e4 LT	519	unsigned long hpte_group;
1da177e4 LT	520
3c726f8d	521	pa = pte_pfn(__pte(old_pte)) << PAGE_SHIFT;
1da177e4 LT	522
	523	repeat:
	524	hpte_group = ((hash & htab_hash_mask) *
	525	HPTES_PER_GROUP) & ~0x7UL;
	526
3c726f8d	527	/* clear HPTE slot informations in new PTE */
41743a4e BH	528	#ifdef CONFIG_PPC_64K_PAGES
	529	new_pte = (new_pte & ~_PAGE_HPTEFLAGS) \| _PAGE_HPTE_SUB0;
	530	#else
3c726f8d	531	new_pte = (new_pte & ~_PAGE_HPTEFLAGS) \| _PAGE_HASHPTE;
41743a4e	532	#endif
1da177e4	533	/* Add in WIMG bits */
87e9ab13 DK	534	rflags \|= (new_pte & (_PAGE_WRITETHRU \| _PAGE_NO_CACHE \|
87e9ab13 DK	535	_PAGE_COHERENT \| _PAGE_GUARDED));
1da177e4	536
3c726f8d BH	537	/* Insert into the hash table, primary slot */
3c726f8d BH	538	slot = ppc_md.hpte_insert(hpte_group, va, pa, rflags, 0,
1189be65	539	mmu_huge_psize, ssize);
1da177e4 LT	540
	541	/* Primary is full, try the secondary */
	542	if (unlikely(slot == -1)) {
1da177e4 LT	543	hpte_group = ((~hash & htab_hash_mask) *
1da177e4 LT	544	HPTES_PER_GROUP) & ~0x7UL;
3c726f8d	545	slot = ppc_md.hpte_insert(hpte_group, va, pa, rflags,
67b10813	546	HPTE_V_SECONDARY,
1189be65	547	mmu_huge_psize, ssize);
1da177e4 LT	548	if (slot == -1) {
1da177e4 LT	549	if (mftb() & 0x1)
67b10813 BH	550	hpte_group = ((hash & htab_hash_mask) *
67b10813 BH	551	HPTES_PER_GROUP)&~0x7UL;
1da177e4 LT	552
	553	ppc_md.hpte_remove(hpte_group);
	554	goto repeat;
	555	}
	556	}
	557
	558	if (unlikely(slot == -2))
	559	panic("hash_huge_page: pte_insert failed\n");
	560
d649bd7b	561	new_pte \|= (slot << 12) & (_PAGE_F_SECOND \| _PAGE_F_GIX);
1da177e4 LT	562	}
1da177e4 LT	563
3c726f8d	564	/*
01edcd89	565	* No need to use ldarx/stdcx here
3c726f8d BH	566	*/
	567	*ptep = __pte(new_pte & ~_PAGE_BUSY);
	568
1da177e4 LT	569	err = 0;
	570
	571	out:
1da177e4 LT	572	return err;
1da177e4 LT	573	}
f10a04c0	574
4ec161cf JT	575	void set_huge_psize(int psize)
	576	{
	577	/* Check that it is a page size supported by the hardware and
	578	* that it fits within pagetable limits. */
	579	if (mmu_psize_defs[psize].shift && mmu_psize_defs[psize].shift < SID_SHIFT &&
	580	(mmu_psize_defs[psize].shift > MIN_HUGEPTE_SHIFT \|\|
	581	mmu_psize_defs[psize].shift == HPAGE_SHIFT_64K)) {
	582	HPAGE_SHIFT = mmu_psize_defs[psize].shift;
	583	mmu_huge_psize = psize;
	584	#ifdef CONFIG_PPC_64K_PAGES
	585	hugepte_shift = (PMD_SHIFT-HPAGE_SHIFT);
	586	#else
	587	if (HPAGE_SHIFT == HPAGE_SHIFT_64K)
	588	hugepte_shift = (PMD_SHIFT-HPAGE_SHIFT);
	589	else
	590	hugepte_shift = (PUD_SHIFT-HPAGE_SHIFT);
	591	#endif
	592
	593	} else
	594	HPAGE_SHIFT = 0;
	595	}
	596
	597	static int __init hugepage_setup_sz(char *str)
	598	{
	599	unsigned long long size;
	600	int mmu_psize = -1;
	601	int shift;
	602
	603	size = memparse(str, &str);
	604
	605	shift = __ffs(size);
	606	switch (shift) {
	607	#ifndef CONFIG_PPC_64K_PAGES
	608	case HPAGE_SHIFT_64K:
	609	mmu_psize = MMU_PAGE_64K;
	610	break;
	611	#endif
	612	case HPAGE_SHIFT_16M:
	613	mmu_psize = MMU_PAGE_16M;
	614	break;
	615	}
	616
	617	if (mmu_psize >=0 && mmu_psize_defs[mmu_psize].shift)
	618	set_huge_psize(mmu_psize);
	619	else
	620	printk(KERN_WARNING "Invalid huge page size specified(%llu)\n", size);
	621
	622	return 1;
	623	}
	624	__setup("hugepagesz=", hugepage_setup_sz);
	625
/*
 * Constructor passed to kmem_cache_create() for the hugepte cache:
 * zero-fills a newly constructed object.
 *
 * cache: cache the object belongs to; its object size gives the length
 * addr:  start of the object to clear
 *
 * Both parameters are pointers; memset() needs an address, and a
 * parameter of plain type void is not valid C.
 */
static void zero_ctor(struct kmem_cache *cache, void *addr)
{
	memset(addr, 0, kmem_cache_size(cache));
}
	630
	631	static int __init hugetlbpage_init(void)
	632	{
	633	if (!cpu_has_feature(CPU_FTR_16M_PAGE))
	634	return -ENODEV;
	635
	636	huge_pgtable_cache = kmem_cache_create("hugepte_cache",
	637	HUGEPTE_TABLE_SIZE,
	638	HUGEPTE_TABLE_SIZE,
f0f3980b	639	0,
20c2df83	640	zero_ctor);
f10a04c0 DG	641	if (! huge_pgtable_cache)
	642	panic("hugetlbpage_init(): could not create hugepte cache\n");
	643
	644	return 0;
	645	}
	646
	647	module_init(hugetlbpage_init);