[deliverable/linux.git] / arch / tile / mm / pgtable.c

/*
 * Copyright 2010 Tilera Corporation. All Rights Reserved.
 *
 *   This program is free software; you can redistribute it and/or
 *   modify it under the terms of the GNU General Public License
 *   as published by the Free Software Foundation, version 2.
 *
 *   This program is distributed in the hope that it will be useful, but
 *   WITHOUT ANY WARRANTY; without even the implied warranty of
 *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
 *   NON INFRINGEMENT.  See the GNU General Public License for
 *   more details.
 */

#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/highmem.h>
#include <linux/slab.h>
#include <linux/pagemap.h>
#include <linux/spinlock.h>
#include <linux/cpumask.h>
#include <linux/module.h>
#include <linux/io.h>
#include <linux/vmalloc.h>
#include <linux/smp.h>

#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/fixmap.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>
#include <asm/homecache.h>

#define K(x) ((x) << (PAGE_SHIFT-10))

/*
 * The normal show_free_areas() is too verbose on Tile, with dozens
 * of processors and often four NUMA zones each with high and lowmem.
 *
 * Print one summary line of global page counters, then one line per
 * populated zone giving the total free memory and the size of the
 * largest free block, both in kB.
 *
 * @filter is accepted for interface compatibility; it is not used here.
 */
void show_mem(unsigned int filter)
{
	struct zone *zone;

	pr_err("Active:%lu inactive:%lu dirty:%lu writeback:%lu unstable:%lu"
	       " free:%lu\n slab:%lu mapped:%lu pagetables:%lu bounce:%lu"
	       " pagecache:%lu swap:%lu\n",
	       (global_page_state(NR_ACTIVE_ANON) +
		global_page_state(NR_ACTIVE_FILE)),
	       (global_page_state(NR_INACTIVE_ANON) +
		global_page_state(NR_INACTIVE_FILE)),
	       global_page_state(NR_FILE_DIRTY),
	       global_page_state(NR_WRITEBACK),
	       global_page_state(NR_UNSTABLE_NFS),
	       global_page_state(NR_FREE_PAGES),
	       (global_page_state(NR_SLAB_RECLAIMABLE) +
		global_page_state(NR_SLAB_UNRECLAIMABLE)),
	       global_page_state(NR_FILE_MAPPED),
	       global_page_state(NR_PAGETABLE),
	       global_page_state(NR_BOUNCE),
	       global_page_state(NR_FILE_PAGES),
	       get_nr_swap_pages());

	for_each_zone(zone) {
		unsigned long flags, order, total = 0;
		/*
		 * Signed, with -1 meaning "no free block seen".  Order 0 is
		 * a legitimate answer and must not be confused with "none",
		 * and -1 must never be used as a shift count.
		 */
		long largest_order = -1;

		if (!populated_zone(zone))
			continue;

		spin_lock_irqsave(&zone->lock, flags);
		for (order = 0; order < MAX_ORDER; order++) {
			/* Keep full width so the shift below cannot truncate. */
			unsigned long nr = zone->free_area[order].nr_free;

			total += nr << order;
			if (nr)
				largest_order = order;
		}
		spin_unlock_irqrestore(&zone->lock, flags);
		pr_err("Node %d %7s: %lukB (largest %luKb)\n",
		       zone_to_nid(zone), zone->name,
		       K(total),
		       largest_order >= 0 ? K(1UL) << largest_order : 0UL);
	}
}

/**
 * shatter_huge_page() - ensure a given address is mapped by a small page.
 * @addr: Address at which to shatter any existing huge page.  It is
 *        rounded down to a huge-page boundary before use.
 *
 * This function converts a huge PTE mapping kernel LOWMEM into a bunch
 * of small PTEs with the same caching.  No cache flush required, but we
 * must do a global TLB flush.
 *
 * Any caller that wishes to modify a kernel mapping that might
 * have been made with a huge page should call this function,
 * since doing so properly avoids race conditions with installing the
 * newly-shattered page and then flushing all the TLB entries.
 *
 * The function BUG()s if @addr is not a valid kernel address or if the
 * pud/pmd entries covering it are not present.  If the pmd is already
 * not a huge page, it returns without doing anything.
 */
void shatter_huge_page(unsigned long addr)
{
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	unsigned long flags = 0;  /* happy compiler */
#ifdef __PAGETABLE_PMD_FOLDED
	struct list_head *pos;
#endif

	/* Get a pointer to the pmd entry that we need to change. */
	addr &= HPAGE_MASK;
	BUG_ON(pgd_addr_invalid(addr));
	BUG_ON(addr < PAGE_OFFSET);  /* only for kernel LOWMEM */
	pgd = swapper_pg_dir + pgd_index(addr);
	pud = pud_offset(pgd, addr);
	BUG_ON(!pud_present(*pud));
	pmd = pmd_offset(pud, addr);
	BUG_ON(!pmd_present(*pmd));
	/* Unlocked fast path: nothing to do if it is already small pages. */
	if (!pmd_huge_page(*pmd))
		return;

	/* Re-check under the lock; another cpu may have shattered it. */
	spin_lock_irqsave(&init_mm.page_table_lock, flags);
	if (!pmd_huge_page(*pmd)) {
		/* Lost the race to convert the huge page. */
		spin_unlock_irqrestore(&init_mm.page_table_lock, flags);
		return;
	}

	/* Shatter the huge page into the preallocated L2 page table. */
	pmd_populate_kernel(&init_mm, pmd, get_prealloc_pte(pmd_pfn(*pmd)));

#ifdef __PAGETABLE_PMD_FOLDED
	/*
	 * Walk every pgd on the system and update the pmd there.
	 * pgd_lock is taken inside init_mm.page_table_lock; interrupts
	 * are already disabled by the irqsave above.
	 */
	spin_lock(&pgd_lock);
	list_for_each(pos, &pgd_list) {
		pmd_t *copy_pmd;
		pgd = list_to_pgd(pos) + pgd_index(addr);
		pud = pud_offset(pgd, addr);
		copy_pmd = pmd_offset(pud, addr);
		__set_pmd(copy_pmd, *pmd);
	}
	spin_unlock(&pgd_lock);
#endif

	/* Tell every cpu to notice the change. */
	flush_remote(0, 0, NULL, addr, HPAGE_SIZE, HPAGE_SIZE,
		     cpu_possible_mask, NULL, 0);

	/* Hold the lock until the TLB flush is finished to avoid races. */
	spin_unlock_irqrestore(&init_mm.page_table_lock, flags);
}

/*
 * List of all pgd's, needed so that entries can be invalidated in both
 * cached and uncached pgd's.  This is essentially codepath-based locking
 * against pageattr.c; it is the unique case in which a valid change
 * of kernel pagetables can't be lazily synchronized by vmalloc faults.
 * vmalloc faults work because attached pagetables are never freed.
 *
 * pgd_lock protects pgd_list: pgd_ctor() and pgd_dtor() add and remove
 * entries under it, and shatter_huge_page() walks the list under it.
 *
 * The lock is always taken with interrupts disabled, unlike on x86
 * and other platforms, because we need to take the lock in
 * shatter_huge_page(), which may be called from an interrupt context.
 * We are not at risk from the tlbflush IPI deadlock that was seen on
 * x86, since we use the flush_remote() API to have the hypervisor do
 * the TLB flushes regardless of irq disabling.
 */
DEFINE_SPINLOCK(pgd_lock);
LIST_HEAD(pgd_list);

/*
 * Link a pgd onto the global pgd_list.  No locking is done here; the
 * caller in this file (pgd_ctor()) holds pgd_lock around the call.
 */
static inline void pgd_list_add(pgd_t *pgd)
{
	list_add(pgd_to_list(pgd), &pgd_list);
}

/*
 * Unlink a pgd from the global pgd_list.  No locking is done here; the
 * caller in this file (pgd_dtor()) holds pgd_lock around the call.
 */
static inline void pgd_list_del(pgd_t *pgd)
{
	list_del(pgd_to_list(pgd));
}

#define KERNEL_PGD_INDEX_START pgd_index(PAGE_OFFSET)
#define KERNEL_PGD_PTRS (PTRS_PER_PGD - KERNEL_PGD_INDEX_START)

/*
 * Initialize a freshly allocated pgd: clear the user portion, copy the
 * kernel portion from swapper_pg_dir, and put the pgd on pgd_list.
 * The copy and the list insertion happen together under pgd_lock.
 */
static void pgd_ctor(pgd_t *pgd)
{
	pgd_t *kernel_dst = pgd + KERNEL_PGD_INDEX_START;
	pgd_t *kernel_src = swapper_pg_dir + KERNEL_PGD_INDEX_START;
	unsigned long irqflags;

	/* The user entries start out empty; this needs no lock. */
	memset(pgd, 0, KERNEL_PGD_INDEX_START * sizeof(*pgd));

	spin_lock_irqsave(&pgd_lock, irqflags);

#ifndef __tilegx__
	/*
	 * The swapper must never have an L2 for the user interrupt
	 * vector, so every new page table likewise begins with that
	 * slot empty.
	 */
	BUG_ON(((u64 *)swapper_pg_dir)[pgd_index(MEM_USER_INTRPT)] != 0);
#endif

	/* Inherit the kernel mappings from the swapper page table. */
	memcpy(kernel_dst, kernel_src, KERNEL_PGD_PTRS * sizeof(*pgd));

	pgd_list_add(pgd);

	spin_unlock_irqrestore(&pgd_lock, irqflags);
}

/*
 * Undo pgd_ctor()'s list insertion: remove the pgd from pgd_list while
 * holding pgd_lock with interrupts disabled.
 */
static void pgd_dtor(pgd_t *pgd)
{
	unsigned long flags; /* can be called from interrupt context */

	spin_lock_irqsave(&pgd_lock, flags);
	pgd_list_del(pgd);
	spin_unlock_irqrestore(&pgd_lock, flags);
}

pgd_t *pgd_alloc(struct mm_struct *mm)
{
	pgd_t *pgd = kmem_cache_alloc(pgd_cache, GFP_KERNEL);
	if (pgd)
		pgd_ctor(pgd);
	return pgd;
}

/*
 * Release a pgd obtained from pgd_alloc(): take it off pgd_list via
 * pgd_dtor(), then hand it back to pgd_cache.  @mm is not used here.
 */
void pgd_free(struct mm_struct *mm, pgd_t *pgd)
{
	pgd_dtor(pgd);
	kmem_cache_free(pgd_cache, pgd);
}


#define L2_USER_PGTABLE_PAGES (1 << L2_USER_PGTABLE_ORDER)

/*
 * Allocate and zero a user page table made of (1 << order) pages.
 *
 * @mm and @address are not used here.  @order is the log2 number of
 * pages to allocate.  Returns the first page, or NULL if either the
 * page allocation or pgtable_page_ctor() fails.
 *
 * The table must later be released with pgtable_free() or
 * __pgtable_free_tlb() using the same @order, since those routines
 * undo the per-page setup done here.
 */
struct page *pgtable_alloc_one(struct mm_struct *mm, unsigned long address,
			       int order)
{
	gfp_t flags = GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO;
	int nr_pages = 1 << order;
	struct page *p;
	int i;

	/* Honor the requested order rather than a fixed one. */
	p = alloc_pages(flags, order);
	if (p == NULL)
		return NULL;

	if (!pgtable_page_ctor(p)) {
		__free_pages(p, order);
		return NULL;
	}

	/*
	 * Make every page have a page_count() of one, not just the first.
	 * We don't use __GFP_COMP since it doesn't look like it works
	 * correctly with tlb_remove_page().
	 *
	 * The bound is the page count, not the order: iterating up to
	 * "order" would skip the tail pages of any multi-page table.
	 */
	for (i = 1; i < nr_pages; ++i) {
		init_page_count(p+i);
		inc_zone_page_state(p+i, NR_PAGETABLE);
	}

	return p;
}

/*
 * Free page immediately (used in __pte_alloc if we raced with another
 * process).  We have to correct whatever pte_alloc_one() did before
 * returning the pages to the allocator.
 *
 * @order must match the order the table was allocated with; every one
 * of the (1 << order) pages is freed individually because each was
 * given its own reference at allocation time.  @mm is not used here.
 */
void pgtable_free(struct mm_struct *mm, struct page *p, int order)
{
	int nr_pages = 1 << order;
	int i;

	pgtable_page_dtor(p);
	__free_page(p);

	for (i = 1; i < nr_pages; ++i) {
		/* Fix the accounting before giving the page away. */
		dec_zone_page_state(p+i, NR_PAGETABLE);
		__free_page(p+i);
	}
}

/*
 * Queue a (1 << order)-page user page table for freeing through the
 * mmu_gather @tlb.  This mirrors pgtable_free() but hands each page to
 * tlb_remove_page() instead of freeing it directly.  @address is not
 * used here.
 */
void __pgtable_free_tlb(struct mmu_gather *tlb, struct page *pte,
			unsigned long address, int order)
{
	int nr_pages = 1 << order;
	int i;

	pgtable_page_dtor(pte);
	tlb_remove_page(tlb, pte);

	for (i = 1; i < nr_pages; ++i) {
		/* Fix the accounting before giving the page away. */
		dec_zone_page_state(pte + i, NR_PAGETABLE);
		tlb_remove_page(tlb, pte + i);
	}
}

#ifndef __tilegx__

/*
 * Test and clear the "accessed" bit of the PTE at @ptep.
 *
 * Returns 1 if the bit was set (after clearing it), or 0 if it was
 * already clear, in which case nothing is written.  @vma and @addr are
 * not used by this implementation.
 *
 * FIXME: needs to be atomic vs hypervisor writes.  For now we make the
 * window of vulnerability a bit smaller by doing an unlocked 8-bit update.
 */
int ptep_test_and_clear_young(struct vm_area_struct *vma,
			      unsigned long addr, pte_t *ptep)
{
#if HV_PTE_INDEX_ACCESSED < 8 || HV_PTE_INDEX_ACCESSED >= 16
# error Code assumes HV_PTE "accessed" bit in second byte
#endif
	/* Touch only the single byte that contains the "accessed" bit. */
	u8 *tmp = (u8 *)ptep;
	u8 second_byte = tmp[1];
	if (!(second_byte & (1 << (HV_PTE_INDEX_ACCESSED - 8))))
		return 0;
	tmp[1] = second_byte & ~(1 << (HV_PTE_INDEX_ACCESSED - 8));
	return 1;
}

/*
 * Clear the "writable" bit of the PTE at @ptep.  @mm and @addr are not
 * used by this implementation.
 *
 * This implementation is atomic vs hypervisor writes, since the hypervisor
 * always writes the low word (where "accessed" and "dirty" are) and this
 * routine only writes the high word.
 */
void ptep_set_wrprotect(struct mm_struct *mm,
			unsigned long addr, pte_t *ptep)
{
#if HV_PTE_INDEX_WRITABLE < 32
# error Code assumes HV_PTE "writable" bit in high word
#endif
	/* Read-modify-write of the second 32-bit word only. */
	u32 *tmp = (u32 *)ptep;
	tmp[1] = tmp[1] & ~(1 << (HV_PTE_INDEX_WRITABLE - 32));
}

#endif

/*
 * Look up the page table entry that maps @addr.
 *
 * When @mm is NULL the standard kernel page table (swapper_pg_dir) is
 * walked; virt_to_kpte() is the preferred API for that case.
 *
 * Returns NULL if @addr is not a valid pgd address or if the pud or pmd
 * covering it is not present.  If a present huge page is found at the
 * pud or pmd level, a pointer to that upper-level entry is returned
 * instead.  Otherwise the bottom-level PTE pointer is returned, and
 * that PTE may or may not itself be present.
 */
pte_t *virt_to_pte(struct mm_struct *mm, unsigned long addr)
{
	pgd_t *pgd_entry;
	pud_t *pud_entry;
	pmd_t *pmd_entry;

	if (pgd_addr_invalid(addr))
		return NULL;

	/* Pick the top-level table: the caller's mm or the kernel's. */
	if (mm)
		pgd_entry = pgd_offset(mm, addr);
	else
		pgd_entry = swapper_pg_dir + pgd_index(addr);

	pud_entry = pud_offset(pgd_entry, addr);
	if (!pud_present(*pud_entry))
		return NULL;
	if (pud_huge_page(*pud_entry))
		return (pte_t *)pud_entry;

	pmd_entry = pmd_offset(pud_entry, addr);
	if (!pmd_present(*pmd_entry))
		return NULL;
	if (pmd_huge_page(*pmd_entry))
		return (pte_t *)pmd_entry;

	return pte_offset_kernel(pmd_entry, addr);
}
EXPORT_SYMBOL(virt_to_pte);

/*
 * Look up the PTE for a kernel address in the standard kernel page
 * table.  BUG()s if @kaddr is below PAGE_OFFSET; otherwise behaves as
 * virt_to_pte(NULL, kaddr), including its NULL and huge-page returns.
 */
pte_t *virt_to_kpte(unsigned long kaddr)
{
	BUG_ON(kaddr < PAGE_OFFSET);
	return virt_to_pte(NULL, kaddr);
}
EXPORT_SYMBOL(virt_to_kpte);

/*
 * Return @prot with its LOTAR set to the grid coordinates of @cpu.
 *
 * The cpu number is split into (x, y) using smp_width.  The function
 * BUG()s if the resulting row is outside smp_height, if @prot is not in
 * HV_PTE_MODE_CACHE_TILE_L3 mode, if @cpu is outside [0, NR_CPUS), or
 * if cpu_is_valid_lotar() rejects it.
 *
 * NOTE(review): the coordinates are computed before @cpu is
 * range-checked; every check is fatal, so they are never used if a
 * check fails.
 */
pgprot_t set_remote_cache_cpu(pgprot_t prot, int cpu)
{
	unsigned int grid_width = smp_width;
	int col = cpu % grid_width;
	int row = cpu / grid_width;

	BUG_ON(row >= smp_height);
	BUG_ON(hv_pte_get_mode(prot) != HV_PTE_MODE_CACHE_TILE_L3);
	BUG_ON(cpu < 0 || cpu >= NR_CPUS);
	BUG_ON(!cpu_is_valid_lotar(cpu));

	return hv_pte_set_lotar(prot, HV_XY_TO_LOTAR(col, row));
}

/*
 * Inverse of set_remote_cache_cpu(): turn the LOTAR stored in @prot
 * back into a cpu number using smp_width.  BUG()s if @prot is not in
 * HV_PTE_MODE_CACHE_TILE_L3 mode.
 */
int get_remote_cache_cpu(pgprot_t prot)
{
	HV_LOTAR home = hv_pte_get_lotar(prot);
	int col = HV_LOTAR_X(home);
	int row = HV_LOTAR_Y(home);

	BUG_ON(hv_pte_get_mode(prot) != HV_PTE_MODE_CACHE_TILE_L3);

	return row * smp_width + col;
}

/*
 * Translate a kernel virtual address into a physical address plus
 * homing information.
 *
 * On return *@cpa holds __pa(@va) and *@pte holds an otherwise-empty
 * PTE value carrying the home of the page backing @va.  No page table
 * is written; the PTE is only a return value.  Always returns 0.
 */
int va_to_cpa_and_pte(void *va, unsigned long long *cpa, pte_t *pte)
{
	pte_t blank_pte = { 0 };
	struct page *backing = virt_to_page(va);

	*cpa = __pa(va);
	*pte = pte_set_home(blank_pte, page_home(backing));

	return 0; /* return non-zero if not hfh? */
}
EXPORT_SYMBOL(va_to_cpa_and_pte);

/*
 * Store @pte at @ptep without any homing adjustment.
 *
 * On tilegx this is a single assignment.  Otherwise the 64-bit value is
 * written as two 32-bit stores whose order depends on the new value:
 * word [0] receives the low 32 bits, which the #if check below requires
 * to contain the "present" and "migrating" bits.  For a present PTE
 * that word is written last; for a non-present PTE it is written first.
 * The barrier() between the stores keeps them in that order.
 */
void __set_pte(pte_t *ptep, pte_t pte)
{
#ifdef __tilegx__
	*ptep = pte;
#else
# if HV_PTE_INDEX_PRESENT >= 32 || HV_PTE_INDEX_MIGRATING >= 32
#  error Must write the present and migrating bits last
# endif
	if (pte_present(pte)) {
		/* Becoming present: fill in the high word before the low. */
		((u32 *)ptep)[1] = (u32)(pte_val(pte) >> 32);
		barrier();
		((u32 *)ptep)[0] = (u32)(pte_val(pte));
	} else {
		/* Becoming non-present: write the low word first. */
		((u32 *)ptep)[0] = (u32)(pte_val(pte));
		barrier();
		((u32 *)ptep)[1] = (u32)(pte_val(pte) >> 32);
	}
#endif /* __tilegx__ */
}

/*
 * Install @pte at @ptep, first fixing up its home when it maps memory.
 *
 * For a present, non-MMIO PTE whose PFN is valid, the home is replaced
 * with the one recorded for the backing struct page.  If the PFN is not
 * valid and the PTE carries mode 0, the kernel panics.  Every other PTE
 * is written unchanged.
 */
void set_pte(pte_t *ptep, pte_t pte)
{
	if (pte_present(pte)) {
		int is_mmio = CHIP_HAS_MMIO() &&
			hv_pte_get_mode(pte) == HV_PTE_MODE_MMIO;

		if (!is_mmio) {
			/* The PTE actually references physical memory. */
			unsigned long pfn = pte_pfn(pte);

			if (pfn_valid(pfn)) {
				/* Take the home from the struct page. */
				struct page *backing = pfn_to_page(pfn);

				pte = pte_set_home(pte, page_home(backing));
			} else if (hv_pte_get_mode(pte) == 0) {
				/* remap_pfn_range(), etc, must supply PTE mode. */
				panic("set_pte(): out-of-range PFN and mode 0\n");
			}
		}
	}

	__set_pte(ptep, pte);
}

/*
 * Can this mm load a PTE with cached_priority set?
 * Returns non-zero when mm->context.priority_cached is non-zero.
 */
static inline int mm_is_priority_cached(struct mm_struct *mm)
{
	return mm->context.priority_cached != 0;
}

/*
 * Record that @mm has a priority mapping.  The first time this happens
 * for an mm, its priority_cached field is set to -1UL and the
 * hypervisor is notified; later calls do nothing.
 */
void start_mm_caching(struct mm_struct *mm)
{
	/* Already marked: the hypervisor was told the first time. */
	if (mm_is_priority_cached(mm))
		return;

	mm->context.priority_cached = -1UL;
	hv_set_caching(-1UL);
}

/*
 * Re-validate the priority_cached flag of @mm and return its value.
 *
 * A zero flag needs no scan, because it is set non-zero as soon as a
 * MAP_CACHE_PRIORITY mapping is first considered.  When the flag is
 * non-zero, the vma list is scanned for any mapping that still has
 * cached priority; if none is found the flag is cleared.
 *
 * The mmap_sem is only _tried_: we are in an interrupt context
 * (servicing switch_mm), so if it cannot be acquired the flag is left
 * as it is.  Presumably a later call will have more luck and clear the
 * value then; until that happens the cache stays marked for priority.
 */
static unsigned long update_priority_cached(struct mm_struct *mm)
{
	struct vm_area_struct *vma;

	/* Nothing to re-check, or no way to check it safely right now. */
	if (mm->context.priority_cached == 0 ||
	    !down_write_trylock(&mm->mmap_sem))
		return mm->context.priority_cached;

	/* Stop at the first mapping that still wants cached priority. */
	vma = mm->mmap;
	while (vma && !hv_pte_get_cached_priority(vma->vm_page_prot))
		vma = vma->vm_next;

	/* Ran off the end of the list: no priority mappings remain. */
	if (!vma)
		mm->context.priority_cached = 0;

	up_write(&mm->mmap_sem);

	return mm->context.priority_cached;
}

/*
 * Set caching correctly for an mm that we are switching to.
 *
 * If @next uses priority caching, its flag is re-validated and the
 * result is passed to the hypervisor.  Otherwise the hypervisor setting
 * is only reset to zero when @prev had priority caching on; if @prev
 * did not, it is assumed to be zero already.
 */
void check_mm_caching(struct mm_struct *prev, struct mm_struct *next)
{
	if (mm_is_priority_cached(next)) {
		hv_set_caching(update_priority_cached(next));
		return;
	}

	/*
	 * The new mm does not use priority caching, so the only
	 * question is whether the old one left it switched on.
	 */
	if (mm_is_priority_cached(prev))
		hv_set_caching(0);
}

#if CHIP_HAS_MMIO()

/*
 * Map an arbitrary MMIO address, homed according to pgprot, into VA space.
 *
 * @phys_addr and @size need not be page-aligned; the mapping is widened
 * to whole pages and the returned pointer keeps the sub-page offset of
 * @phys_addr.  Only the LOTAR of @home is used.
 *
 * Returns NULL if @size is zero, if the range wraps around, if no VM
 * area can be obtained, or if establishing the page mappings fails.
 */
void __iomem *ioremap_prot(resource_size_t phys_addr, unsigned long size,
			   pgprot_t home)
{
	struct vm_struct *vm;
	unsigned long in_page, end_addr, vstart;
	pgprot_t prot;

	/* Reject an empty range. */
	if (size == 0)
		return NULL;

	/* Reject a range whose last byte wraps below its first. */
	end_addr = phys_addr + size - 1;
	if (end_addr < phys_addr)
		return NULL;

	/* Build a read/write MMIO protection homed at the requested shim. */
	prot = PAGE_KERNEL;
	prot = hv_pte_set_mode(prot, HV_PTE_MODE_MMIO);
	prot = hv_pte_set_lotar(prot, hv_pte_get_lotar(home));

	/* Widen the request to whole pages, remembering the offset. */
	in_page = phys_addr & ~PAGE_MASK;
	phys_addr &= PAGE_MASK;
	size = PAGE_ALIGN(end_addr+1) - phys_addr;

	/* Reserve virtual address space for the mapping. */
	vm = get_vm_area(size, VM_IOREMAP /* | other flags? */);
	if (!vm)
		return NULL;
	vm->phys_addr = phys_addr;

	vstart = (unsigned long)vm->addr;
	if (ioremap_page_range(vstart, vstart + size, phys_addr, prot)) {
		free_vm_area(vm);
		return NULL;
	}

	return (__force void __iomem *) (in_page + (char *)vm->addr);
}
EXPORT_SYMBOL(ioremap_prot);

/*
 * Unmap an MMIO VA mapping.
 *
 * @addr_in may carry a sub-page offset (ioremap_prot() adds one to the
 * address it returns), so it is rounded down to its page base before
 * being handed to vunmap().
 *
 * Note that x86 uses a more elaborate flow here (looking the area up
 * and removing it by hand) rather than calling vunmap(); this
 * implementation relies on vunmap() alone.
 */
void iounmap(volatile void __iomem *addr_in)
{
	volatile void __iomem *addr = (volatile void __iomem *)
		(PAGE_MASK & (unsigned long __force)addr_in);

	vunmap((void * __force)addr);
}
EXPORT_SYMBOL(iounmap);

#endif /* CHIP_HAS_MMIO() */
Commit	Line	Data
867e359b CM	1	/*
	2	* Copyright 2010 Tilera Corporation. All Rights Reserved.
	3	*
	4	* This program is free software; you can redistribute it and/or
	5	* modify it under the terms of the GNU General Public License
	6	* as published by the Free Software Foundation, version 2.
	7	*
	8	* This program is distributed in the hope that it will be useful, but
	9	* WITHOUT ANY WARRANTY; without even the implied warranty of
	10	* MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
	11	* NON INFRINGEMENT. See the GNU General Public License for
	12	* more details.
	13	*/
	14
	15	#include <linux/sched.h>
	16	#include <linux/kernel.h>
	17	#include <linux/errno.h>
	18	#include <linux/mm.h>
	19	#include <linux/swap.h>
867e359b CM	20	#include <linux/highmem.h>
	21	#include <linux/slab.h>
	22	#include <linux/pagemap.h>
	23	#include <linux/spinlock.h>
	24	#include <linux/cpumask.h>
	25	#include <linux/module.h>
	26	#include <linux/io.h>
	27	#include <linux/vmalloc.h>
	28	#include <linux/smp.h>
	29
867e359b CM	30	#include <asm/pgtable.h>
	31	#include <asm/pgalloc.h>
	32	#include <asm/fixmap.h>
	33	#include <asm/tlb.h>
	34	#include <asm/tlbflush.h>
	35	#include <asm/homecache.h>
	36
	37	#define K(x) ((x) << (PAGE_SHIFT-10))
	38
	39	/*
	40	* The normal show_free_areas() is too verbose on Tile, with dozens
	41	* of processors and often four NUMA zones each with high and lowmem.
	42	*/
b2b755b5	43	void show_mem(unsigned int filter)
867e359b CM	44	{
	45	struct zone *zone;
	46
0707ad30	47	pr_err("Active:%lu inactive:%lu dirty:%lu writeback:%lu unstable:%lu"
867e359b CM	48	" free:%lu\n slab:%lu mapped:%lu pagetables:%lu bounce:%lu"
	49	" pagecache:%lu swap:%lu\n",
	50	(global_page_state(NR_ACTIVE_ANON) +
	51	global_page_state(NR_ACTIVE_FILE)),
	52	(global_page_state(NR_INACTIVE_ANON) +
	53	global_page_state(NR_INACTIVE_FILE)),
	54	global_page_state(NR_FILE_DIRTY),
	55	global_page_state(NR_WRITEBACK),
	56	global_page_state(NR_UNSTABLE_NFS),
	57	global_page_state(NR_FREE_PAGES),
	58	(global_page_state(NR_SLAB_RECLAIMABLE) +
	59	global_page_state(NR_SLAB_UNRECLAIMABLE)),
	60	global_page_state(NR_FILE_MAPPED),
	61	global_page_state(NR_PAGETABLE),
	62	global_page_state(NR_BOUNCE),
	63	global_page_state(NR_FILE_PAGES),
ec8acf20	64	get_nr_swap_pages());
867e359b CM	65
	66	for_each_zone(zone) {
	67	unsigned long flags, order, total = 0, largest_order = -1;
	68
	69	if (!populated_zone(zone))
	70	continue;
	71
867e359b CM	72	spin_lock_irqsave(&zone->lock, flags);
	73	for (order = 0; order < MAX_ORDER; order++) {
	74	int nr = zone->free_area[order].nr_free;
	75	total += nr << order;
	76	if (nr)
	77	largest_order = order;
	78	}
	79	spin_unlock_irqrestore(&zone->lock, flags);
0707ad30 CM	80	pr_err("Node %d %7s: %lukB (largest %luKb)\n",
0707ad30 CM	81	zone_to_nid(zone), zone->name,
867e359b CM	82	K(total), largest_order ? K(1UL) << largest_order : 0);
	83	}
	84	}
	85
76c567fb CM	86	/**
	87	* shatter_huge_page() - ensure a given address is mapped by a small page.
	88	*
	89	* This function converts a huge PTE mapping kernel LOWMEM into a bunch
	90	* of small PTEs with the same caching. No cache flush required, but we
	91	* must do a global TLB flush.
	92	*
	93	* Any caller that wishes to modify a kernel mapping that might
	94	* have been made with a huge page should call this function,
	95	* since doing so properly avoids race conditions with installing the
	96	* newly-shattered page and then flushing all the TLB entries.
	97	*
	98	* @addr: Address at which to shatter any existing huge page.
	99	*/
	100	void shatter_huge_page(unsigned long addr)
	101	{
	102	pgd_t *pgd;
	103	pud_t *pud;
	104	pmd_t *pmd;
	105	unsigned long flags = 0; /* happy compiler */
	106	#ifdef __PAGETABLE_PMD_FOLDED
	107	struct list_head *pos;
	108	#endif
	109
	110	/* Get a pointer to the pmd entry that we need to change. */
	111	addr &= HPAGE_MASK;
	112	BUG_ON(pgd_addr_invalid(addr));
	113	BUG_ON(addr < PAGE_OFFSET); /* only for kernel LOWMEM */
	114	pgd = swapper_pg_dir + pgd_index(addr);
	115	pud = pud_offset(pgd, addr);
	116	BUG_ON(!pud_present(*pud));
	117	pmd = pmd_offset(pud, addr);
	118	BUG_ON(!pmd_present(*pmd));
	119	if (!pmd_huge_page(*pmd))
	120	return;
	121
719ea79e	122	spin_lock_irqsave(&init_mm.page_table_lock, flags);
76c567fb CM	123	if (!pmd_huge_page(*pmd)) {
76c567fb CM	124	/* Lost the race to convert the huge page. */
719ea79e	125	spin_unlock_irqrestore(&init_mm.page_table_lock, flags);
76c567fb CM	126	return;
	127	}
	128
	129	/* Shatter the huge page into the preallocated L2 page table. */
8629470e	130	pmd_populate_kernel(&init_mm, pmd, get_prealloc_pte(pmd_pfn(*pmd)));
76c567fb CM	131
	132	#ifdef __PAGETABLE_PMD_FOLDED
	133	/* Walk every pgd on the system and update the pmd there. */
719ea79e	134	spin_lock(&pgd_lock);
76c567fb CM	135	list_for_each(pos, &pgd_list) {
	136	pmd_t *copy_pmd;
	137	pgd = list_to_pgd(pos) + pgd_index(addr);
	138	pud = pud_offset(pgd, addr);
	139	copy_pmd = pmd_offset(pud, addr);
	140	__set_pmd(copy_pmd, *pmd);
	141	}
719ea79e	142	spin_unlock(&pgd_lock);
76c567fb CM	143	#endif
	144
	145	/* Tell every cpu to notice the change. */
	146	flush_remote(0, 0, NULL, addr, HPAGE_SIZE, HPAGE_SIZE,
	147	cpu_possible_mask, NULL, 0);
	148
	149	/* Hold the lock until the TLB flush is finished to avoid races. */
719ea79e	150	spin_unlock_irqrestore(&init_mm.page_table_lock, flags);
76c567fb CM	151	}
76c567fb CM	152
867e359b CM	153	/*
	154	* List of all pgd's needed so it can invalidate entries in both cached
	155	* and uncached pgd's. This is essentially codepath-based locking
	156	* against pageattr.c; it is the unique case in which a valid change
	157	* of kernel pagetables can't be lazily synchronized by vmalloc faults.
	158	* vmalloc faults work because attached pagetables are never freed.
719ea79e CM	159	*
	160	* The lock is always taken with interrupts disabled, unlike on x86
	161	* and other platforms, because we need to take the lock in
	162	* shatter_huge_page(), which may be called from an interrupt context.
	163	* We are not at risk from the tlbflush IPI deadlock that was seen on
	164	* x86, since we use the flush_remote() API to have the hypervisor do
	165	* the TLB flushes regardless of irq disabling.
867e359b CM	166	*/
	167	DEFINE_SPINLOCK(pgd_lock);
	168	LIST_HEAD(pgd_list);
	169
	170	static inline void pgd_list_add(pgd_t *pgd)
	171	{
	172	list_add(pgd_to_list(pgd), &pgd_list);
	173	}
	174
	175	static inline void pgd_list_del(pgd_t *pgd)
	176	{
	177	list_del(pgd_to_list(pgd));
	178	}
	179
	180	#define KERNEL_PGD_INDEX_START pgd_index(PAGE_OFFSET)
	181	#define KERNEL_PGD_PTRS (PTRS_PER_PGD - KERNEL_PGD_INDEX_START)
	182
	183	static void pgd_ctor(pgd_t *pgd)
	184	{
	185	unsigned long flags;
	186
	187	memset(pgd, 0, KERNEL_PGD_INDEX_START*sizeof(pgd_t));
	188	spin_lock_irqsave(&pgd_lock, flags);
	189
	190	#ifndef __tilegx__
	191	/*
	192	* Check that the user interrupt vector has no L2.
	193	* It never should for the swapper, and new page tables
	194	* should always start with an empty user interrupt vector.
	195	*/
	196	BUG_ON(((u64 *)swapper_pg_dir)[pgd_index(MEM_USER_INTRPT)] != 0);
	197	#endif
	198
76c567fb CM	199	memcpy(pgd + KERNEL_PGD_INDEX_START,
	200	swapper_pg_dir + KERNEL_PGD_INDEX_START,
	201	KERNEL_PGD_PTRS * sizeof(pgd_t));
867e359b CM	202
	203	pgd_list_add(pgd);
	204	spin_unlock_irqrestore(&pgd_lock, flags);
	205	}
	206
	207	static void pgd_dtor(pgd_t *pgd)
	208	{
	209	unsigned long flags; /* can be called from interrupt context */
	210
	211	spin_lock_irqsave(&pgd_lock, flags);
	212	pgd_list_del(pgd);
	213	spin_unlock_irqrestore(&pgd_lock, flags);
	214	}
	215
	216	pgd_t pgd_alloc(struct mm_struct mm)
	217	{
	218	pgd_t *pgd = kmem_cache_alloc(pgd_cache, GFP_KERNEL);
	219	if (pgd)
	220	pgd_ctor(pgd);
	221	return pgd;
	222	}
	223
	224	void pgd_free(struct mm_struct mm, pgd_t pgd)
	225	{
	226	pgd_dtor(pgd);
	227	kmem_cache_free(pgd_cache, pgd);
	228	}
	229
	230
	231	#define L2_USER_PGTABLE_PAGES (1 << L2_USER_PGTABLE_ORDER)
	232
d5d14ed6 CM	233	struct page pgtable_alloc_one(struct mm_struct mm, unsigned long address,
d5d14ed6 CM	234	int order)
867e359b	235	{
76c567fb	236	gfp_t flags = GFP_KERNEL\|__GFP_REPEAT\|__GFP_ZERO;
867e359b	237	struct page *p;
76c567fb	238	int i;
867e359b	239
867e359b CM	240	p = alloc_pages(flags, L2_USER_PGTABLE_ORDER);
	241	if (p == NULL)
	242	return NULL;
	243
76b3aec3 KS	244	if (!pgtable_page_ctor(p)) {
	245	__free_pages(p, L2_USER_PGTABLE_ORDER);
	246	return NULL;
	247	}
	248
76c567fb CM	249	/*
	250	* Make every page have a page_count() of one, not just the first.
	251	* We don't use __GFP_COMP since it doesn't look like it works
	252	* correctly with tlb_remove_page().
	253	*/
d5d14ed6	254	for (i = 1; i < order; ++i) {
76c567fb CM	255	init_page_count(p+i);
	256	inc_zone_page_state(p+i, NR_PAGETABLE);
	257	}
76c567fb	258
867e359b CM	259	return p;
	260	}
	261
	262	/*
	263	* Free page immediately (used in __pte_alloc if we raced with another
	264	* process). We have to correct whatever pte_alloc_one() did before
	265	* returning the pages to the allocator.
	266	*/
d5d14ed6	267	void pgtable_free(struct mm_struct mm, struct page p, int order)
867e359b	268	{
76c567fb CM	269	int i;
76c567fb CM	270
867e359b	271	pgtable_page_dtor(p);
76c567fb CM	272	__free_page(p);
76c567fb CM	273
d5d14ed6	274	for (i = 1; i < order; ++i) {
76c567fb CM	275	__free_page(p+i);
	276	dec_zone_page_state(p+i, NR_PAGETABLE);
	277	}
867e359b CM	278	}
867e359b CM	279
d5d14ed6 CM	280	void __pgtable_free_tlb(struct mmu_gather tlb, struct page pte,
d5d14ed6 CM	281	unsigned long address, int order)
867e359b CM	282	{
	283	int i;
	284
	285	pgtable_page_dtor(pte);
76c567fb CM	286	tlb_remove_page(tlb, pte);
76c567fb CM	287
d5d14ed6	288	for (i = 1; i < order; ++i) {
342d87ef	289	tlb_remove_page(tlb, pte + i);
76c567fb CM	290	dec_zone_page_state(pte + i, NR_PAGETABLE);
76c567fb CM	291	}
867e359b CM	292	}
	293
	294	#ifndef __tilegx__
	295
	296	/*
	297	* FIXME: needs to be atomic vs hypervisor writes. For now we make the
	298	* window of vulnerability a bit smaller by doing an unlocked 8-bit update.
	299	*/
	300	int ptep_test_and_clear_young(struct vm_area_struct *vma,
	301	unsigned long addr, pte_t *ptep)
	302	{
	303	#if HV_PTE_INDEX_ACCESSED < 8 \|\| HV_PTE_INDEX_ACCESSED >= 16
	304	# error Code assumes HV_PTE "accessed" bit in second byte
	305	#endif
	306	u8 tmp = (u8 )ptep;
	307	u8 second_byte = tmp[1];
	308	if (!(second_byte & (1 << (HV_PTE_INDEX_ACCESSED - 8))))
	309	return 0;
	310	tmp[1] = second_byte & ~(1 << (HV_PTE_INDEX_ACCESSED - 8));
	311	return 1;
	312	}
	313
	314	/*
	315	* This implementation is atomic vs hypervisor writes, since the hypervisor
	316	* always writes the low word (where "accessed" and "dirty" are) and this
	317	* routine only writes the high word.
	318	*/
	319	void ptep_set_wrprotect(struct mm_struct *mm,
	320	unsigned long addr, pte_t *ptep)
	321	{
	322	#if HV_PTE_INDEX_WRITABLE < 32
	323	# error Code assumes HV_PTE "writable" bit in high word
	324	#endif
	325	u32 tmp = (u32 )ptep;
	326	tmp[1] = tmp[1] & ~(1 << (HV_PTE_INDEX_WRITABLE - 32));
	327	}
	328
	329	#endif
	330
640710a3 CM	331	/*
	332	* Return a pointer to the PTE that corresponds to the given
	333	* address in the given page table. A NULL page table just uses
	334	* the standard kernel page table; the preferred API in this case
	335	* is virt_to_kpte().
	336	*
	337	* The returned pointer can point to a huge page in other levels
	338	* of the page table than the bottom, if the huge page is present
	339	* in the page table. For bottom-level PTEs, the returned pointer
	340	* can point to a PTE that is either present or not.
	341	*/
867e359b CM	342	pte_t virt_to_pte(struct mm_struct mm, unsigned long addr)
	343	{
	344	pgd_t *pgd;
	345	pud_t *pud;
	346	pmd_t *pmd;
	347
	348	if (pgd_addr_invalid(addr))
	349	return NULL;
	350
	351	pgd = mm ? pgd_offset(mm, addr) : swapper_pg_dir + pgd_index(addr);
	352	pud = pud_offset(pgd, addr);
	353	if (!pud_present(*pud))
	354	return NULL;
a718e10c CM	355	if (pud_huge_page(*pud))
a718e10c CM	356	return (pte_t *)pud;
867e359b	357	pmd = pmd_offset(pud, addr);
867e359b CM	358	if (!pmd_present(*pmd))
867e359b CM	359	return NULL;
640710a3 CM	360	if (pmd_huge_page(*pmd))
640710a3 CM	361	return (pte_t *)pmd;
867e359b CM	362	return pte_offset_kernel(pmd, addr);
867e359b CM	363	}
a718e10c	364	EXPORT_SYMBOL(virt_to_pte);
867e359b	365
640710a3 CM	366	pte_t *virt_to_kpte(unsigned long kaddr)
	367	{
	368	BUG_ON(kaddr < PAGE_OFFSET);
	369	return virt_to_pte(NULL, kaddr);
	370	}
	371	EXPORT_SYMBOL(virt_to_kpte);
	372
867e359b CM	373	pgprot_t set_remote_cache_cpu(pgprot_t prot, int cpu)
	374	{
	375	unsigned int width = smp_width;
	376	int x = cpu % width;
	377	int y = cpu / width;
	378	BUG_ON(y >= smp_height);
	379	BUG_ON(hv_pte_get_mode(prot) != HV_PTE_MODE_CACHE_TILE_L3);
	380	BUG_ON(cpu < 0 \|\| cpu >= NR_CPUS);
	381	BUG_ON(!cpu_is_valid_lotar(cpu));
	382	return hv_pte_set_lotar(prot, HV_XY_TO_LOTAR(x, y));
	383	}
	384
	385	int get_remote_cache_cpu(pgprot_t prot)
	386	{
	387	HV_LOTAR lotar = hv_pte_get_lotar(prot);
	388	int x = HV_LOTAR_X(lotar);
	389	int y = HV_LOTAR_Y(lotar);
	390	BUG_ON(hv_pte_get_mode(prot) != HV_PTE_MODE_CACHE_TILE_L3);
	391	return x + y * smp_width;
	392	}
	393
76c567fb CM	394	/*
	395	* Convert a kernel VA to a PA and homing information.
	396	*/
	397	int va_to_cpa_and_pte(void va, unsigned long long cpa, pte_t *pte)
867e359b	398	{
76c567fb CM	399	struct page *page = virt_to_page(va);
76c567fb CM	400	pte_t null_pte = { 0 };
867e359b	401
76c567fb CM	402	*cpa = __pa(va);
	403
	404	/* Note that this is not writing a page table, just returning a pte. */
	405	*pte = pte_set_home(null_pte, page_home(page));
867e359b	406
76c567fb CM	407	return 0; /* return non-zero if not hfh? */
	408	}
	409	EXPORT_SYMBOL(va_to_cpa_and_pte);
	410
	411	void __set_pte(pte_t *ptep, pte_t pte)
	412	{
867e359b CM	413	#ifdef __tilegx__
	414	*ptep = pte;
	415	#else
76c567fb CM	416	# if HV_PTE_INDEX_PRESENT >= 32 \|\| HV_PTE_INDEX_MIGRATING >= 32
	417	# error Must write the present and migrating bits last
	418	# endif
	419	if (pte_present(pte)) {
	420	((u32 *)ptep)[1] = (u32)(pte_val(pte) >> 32);
	421	barrier();
	422	((u32 *)ptep)[0] = (u32)(pte_val(pte));
	423	} else {
	424	((u32 *)ptep)[0] = (u32)(pte_val(pte));
	425	barrier();
	426	((u32 *)ptep)[1] = (u32)(pte_val(pte) >> 32);
	427	}
	428	#endif /* __tilegx__ */
	429	}
	430
	431	void set_pte(pte_t *ptep, pte_t pte)
	432	{
12400f1f CM	433	if (pte_present(pte) &&
	434	(!CHIP_HAS_MMIO() \|\| hv_pte_get_mode(pte) != HV_PTE_MODE_MMIO)) {
	435	/* The PTE actually references physical memory. */
	436	unsigned long pfn = pte_pfn(pte);
	437	if (pfn_valid(pfn)) {
	438	/* Update the home of the PTE from the struct page. */
	439	pte = pte_set_home(pte, page_home(pfn_to_page(pfn)));
	440	} else if (hv_pte_get_mode(pte) == 0) {
	441	/* remap_pfn_range(), etc, must supply PTE mode. */
	442	panic("set_pte(): out-of-range PFN and mode 0\n");
	443	}
	444	}
76c567fb CM	445
76c567fb CM	446	__set_pte(ptep, pte);
867e359b CM	447	}
	448
	449	/* Can this mm load a PTE with cached_priority set? */
	450	static inline int mm_is_priority_cached(struct mm_struct *mm)
	451	{
d5d14ed6	452	return mm->context.priority_cached != 0;
867e359b CM	453	}
	454
	455	/*
	456	* Add a priority mapping to an mm_context and
	457	* notify the hypervisor if this is the first one.
	458	*/
	459	void start_mm_caching(struct mm_struct *mm)
	460	{
	461	if (!mm_is_priority_cached(mm)) {
d5d14ed6 CM	462	mm->context.priority_cached = -1UL;
d5d14ed6 CM	463	hv_set_caching(-1UL);
867e359b CM	464	}
	465	}
	466
	467	/*
	468	* Validate and return the priority_cached flag. We know if it's zero
	469	* that we don't need to scan, since we immediately set it non-zero
	470	* when we first consider a MAP_CACHE_PRIORITY mapping.
	471	*
	472	* We only _try_ to acquire the mmap_sem semaphore; if we can't acquire it,
	473	* since we're in an interrupt context (servicing switch_mm) we don't
	474	* worry about it and don't unset the "priority_cached" field.
	475	* Presumably we'll come back later and have more luck and clear
	476	* the value then; for now we'll just keep the cache marked for priority.
	477	*/
d5d14ed6	478	static unsigned long update_priority_cached(struct mm_struct *mm)
867e359b CM	479	{
	480	if (mm->context.priority_cached && down_write_trylock(&mm->mmap_sem)) {
	481	struct vm_area_struct *vm;
	482	for (vm = mm->mmap; vm; vm = vm->vm_next) {
	483	if (hv_pte_get_cached_priority(vm->vm_page_prot))
	484	break;
	485	}
	486	if (vm == NULL)
	487	mm->context.priority_cached = 0;
	488	up_write(&mm->mmap_sem);
	489	}
	490	return mm->context.priority_cached;
	491	}
	492
	493	/* Set caching correctly for an mm that we are switching to. */
	494	void check_mm_caching(struct mm_struct prev, struct mm_struct next)
	495	{
	496	if (!mm_is_priority_cached(next)) {
	497	/*
	498	* If the new mm doesn't use priority caching, just see if we
	499	* need the hv_set_caching(), or can assume it's already zero.
	500	*/
	501	if (mm_is_priority_cached(prev))
	502	hv_set_caching(0);
	503	} else {
	504	hv_set_caching(update_priority_cached(next));
	505	}
	506	}
	507
	508	#if CHIP_HAS_MMIO()
	509
	510	/* Map an arbitrary MMIO address, homed according to pgprot, into VA space. */
	511	void __iomem *ioremap_prot(resource_size_t phys_addr, unsigned long size,
	512	pgprot_t home)
	513	{
	514	void *addr;
	515	struct vm_struct *area;
	516	unsigned long offset, last_addr;
	517	pgprot_t pgprot;
	518
	519	/* Don't allow wraparound or zero size */
	520	last_addr = phys_addr + size - 1;
	521	if (!size \|\| last_addr < phys_addr)
	522	return NULL;
	523
	524	/* Create a read/write, MMIO VA mapping homed at the requested shim. */
	525	pgprot = PAGE_KERNEL;
	526	pgprot = hv_pte_set_mode(pgprot, HV_PTE_MODE_MMIO);
	527	pgprot = hv_pte_set_lotar(pgprot, hv_pte_get_lotar(home));
	528
	529	/*
	530	* Mappings have to be page-aligned
	531	*/
	532	offset = phys_addr & ~PAGE_MASK;
	533	phys_addr &= PAGE_MASK;
	534	size = PAGE_ALIGN(last_addr+1) - phys_addr;
	535
	536	/*
	537	* Ok, go for it..
	538	*/
	539	area = get_vm_area(size, VM_IOREMAP /* \| other flags? */);
	540	if (!area)
	541	return NULL;
	542	area->phys_addr = phys_addr;
543	addr = area->addr;
544	if (ioremap_page_range((unsigned long)addr, (unsigned long)addr + size,
545	phys_addr, pgprot)) {
fad052dc	546	free_vm_area(area);
867e359b CM	547	return NULL;
	548	}
	549	return (__force void __iomem ) (offset + (char )addr);
	550	}
	551	EXPORT_SYMBOL(ioremap_prot);
	552
867e359b CM	553	/* Unmap an MMIO VA mapping. */
	554	void iounmap(volatile void __iomem *addr_in)
	555	{
	556	volatile void __iomem addr = (volatile void __iomem )
	557	(PAGE_MASK & (unsigned long __force)addr_in);
	558	#if 1
	559	vunmap((void * __force)addr);
	560	#else
	561	/* x86 uses this complicated flow instead of vunmap(). Is
	562	* there any particular reason we should do the same? */
	563	struct vm_struct p, o;
	564
	565	/* Use the vm area unlocked, assuming the caller
	566	ensures there isn't another iounmap for the same address
	567	in parallel. Reuse of the virtual address is prevented by
	568	leaving it in the global lists until we're done with it.
	569	cpa takes care of the direct mappings. */
ef932473	570	p = find_vm_area((void *)addr);
867e359b CM	571
867e359b CM	572	if (!p) {
0707ad30	573	pr_err("iounmap: bad address %p\n", addr);
867e359b CM	574	dump_stack();
	575	return;
	576	}
	577
	578	/* Finally remove it */
	579	o = remove_vm_area((void *)addr);
	580	BUG_ON(p != o \|\| o == NULL);
	581	kfree(p);
	582	#endif
	583	}
	584	EXPORT_SYMBOL(iounmap);
	585
	586	#endif /* CHIP_HAS_MMIO() */