[deliverable/linux.git] / arch / tile / mm / pgtable.c

/*
 * Copyright 2010 Tilera Corporation. All Rights Reserved.
 *
 *   This program is free software; you can redistribute it and/or
 *   modify it under the terms of the GNU General Public License
 *   as published by the Free Software Foundation, version 2.
 *
 *   This program is distributed in the hope that it will be useful, but
 *   WITHOUT ANY WARRANTY; without even the implied warranty of
 *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
 *   NON INFRINGEMENT.  See the GNU General Public License for
 *   more details.
 */

#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/highmem.h>
#include <linux/slab.h>
#include <linux/pagemap.h>
#include <linux/spinlock.h>
#include <linux/cpumask.h>
#include <linux/module.h>
#include <linux/io.h>
#include <linux/vmalloc.h>
#include <linux/smp.h>

#include <asm/system.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/fixmap.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>
#include <asm/homecache.h>

/* Convert a count of pages into kilobytes (shift left by PAGE_SHIFT-10). */
#define K(x) ((x) << (PAGE_SHIFT-10))

/*
 * The normal show_free_areas() is too verbose on Tile, with dozens
 * of processors and often four NUMA zones each with high and lowmem.
 *
 * Print one line of global page-state counters, then one line per
 * populated zone giving the zone's total free memory and the size of
 * its largest free block (0 if the zone has no free blocks at all).
 * The "filter" argument is not consulted here.
 */
void show_mem(unsigned int filter)
{
	struct zone *zone;

	pr_err("Active:%lu inactive:%lu dirty:%lu writeback:%lu unstable:%lu"
	       " free:%lu\n slab:%lu mapped:%lu pagetables:%lu bounce:%lu"
	       " pagecache:%lu swap:%lu\n",
	       (global_page_state(NR_ACTIVE_ANON) +
		global_page_state(NR_ACTIVE_FILE)),
	       (global_page_state(NR_INACTIVE_ANON) +
		global_page_state(NR_INACTIVE_FILE)),
	       global_page_state(NR_FILE_DIRTY),
	       global_page_state(NR_WRITEBACK),
	       global_page_state(NR_UNSTABLE_NFS),
	       global_page_state(NR_FREE_PAGES),
	       (global_page_state(NR_SLAB_RECLAIMABLE) +
		global_page_state(NR_SLAB_UNRECLAIMABLE)),
	       global_page_state(NR_FILE_MAPPED),
	       global_page_state(NR_PAGETABLE),
	       global_page_state(NR_BOUNCE),
	       global_page_state(NR_FILE_PAGES),
	       nr_swap_pages);

	for_each_zone(zone) {
		unsigned long flags, order, total = 0, largest_kb = 0;

		if (!populated_zone(zone))
			continue;

		spin_lock_irqsave(&zone->lock, flags);
		for (order = 0; order < MAX_ORDER; order++) {
			/* Keep the full width of nr_free; do not narrow it. */
			unsigned long nr = zone->free_area[order].nr_free;

			if (!nr)
				continue;
			total += nr << order;
			/*
			 * Orders are visited in ascending order, so the
			 * last non-empty one is the largest.  Recording
			 * the size here (rather than the order) means an
			 * order-0 block is reported correctly and an
			 * empty zone never shifts by a bogus order.
			 */
			largest_kb = K(1UL) << order;
		}
		spin_unlock_irqrestore(&zone->lock, flags);
		pr_err("Node %d %7s: %lukB (largest %luKb)\n",
		       zone_to_nid(zone), zone->name,
		       K(total), largest_kb);
	}
}

/*
 * Install a kernel mapping of virtual page "vaddr" to physical frame
 * "pfn" with protection "flags" in swapper_pg_dir, then flush the
 * local TLB entry for that page.
 *
 * The pgd, pud and pmd levels must already be populated; a missing
 * level is treated as a bug.
 */
static void set_pte_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags)
{
	pgd_t *pgd_entry = swapper_pg_dir + pgd_index(vaddr);
	pud_t *pud_entry;
	pmd_t *pmd_entry;
	pte_t *slot;

	if (pgd_none(*pgd_entry))
		goto missing_level;

	pud_entry = pud_offset(pgd_entry, vaddr);
	if (pud_none(*pud_entry))
		goto missing_level;

	pmd_entry = pmd_offset(pud_entry, vaddr);
	if (pmd_none(*pmd_entry))
		goto missing_level;

	/* The <pfn,flags> pair is written unmodified so entries can be cleared. */
	slot = pte_offset_kernel(pmd_entry, vaddr);
	set_pte(slot, pfn_pte(pfn, flags));

	/*
	 * Flushing just this one page is sufficient; it looks
	 * conservative given that __set_fixmap is the only caller.
	 */
	local_flush_tlb_page(NULL, vaddr, PAGE_SIZE);
	return;

missing_level:
	BUG();
}

/*
 * Point fixmap slot "idx" at physical address "phys" with protection
 * "flags".  An index at or beyond __end_of_fixed_addresses is a bug.
 */
void __set_fixmap(enum fixed_addresses idx, unsigned long phys, pgprot_t flags)
{
	unsigned long vaddr = __fix_to_virt(idx);

	if (idx < __end_of_fixed_addresses) {
		set_pte_pfn(vaddr, phys >> PAGE_SHIFT, flags);
		return;
	}
	BUG();
}

#if defined(CONFIG_HIGHPTE)
/*
 * Map the page holding the L2 page table referenced by *dir with
 * kmap_atomic() and return a pointer to the PTE for "address" in it.
 *
 * The table's byte offset inside the mapped page is the sub-page part
 * of (pmd_ptfn(*dir) << HV_LOG2_PAGE_TABLE_ALIGN).
 */
pte_t *_pte_offset_map(pmd_t *dir, unsigned long address)
{
	/*
	 * '+' binds tighter than '&', so the offset has to be masked on
	 * its own before being added to the kmap_atomic() address;
	 * otherwise the mask is applied to the whole sum.
	 */
	unsigned long offset =
		(pmd_ptfn(*dir) << HV_LOG2_PAGE_TABLE_ALIGN) & ~PAGE_MASK;
	pte_t *pte = kmap_atomic(pmd_page(*dir)) + offset;

	return &pte[pte_index(address)];
}
#endif

/**
 * shatter_huge_page() - ensure a given address is mapped by a small page.
 *
 * This function converts a huge PTE mapping kernel LOWMEM into a bunch
 * of small PTEs with the same caching.  No cache flush required, but we
 * must do a global TLB flush.
 *
 * Any caller that wishes to modify a kernel mapping that might
 * have been made with a huge page should call this function,
 * since doing so properly avoids race conditions with installing the
 * newly-shattered page and then flushing all the TLB entries.
 *
 * If the address is already mapped by a small page the function returns
 * without taking any lock.
 *
 * @addr: Address at which to shatter any existing huge page.
 */
void shatter_huge_page(unsigned long addr)
{
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	unsigned long flags = 0;  /* happy compiler */
#ifdef __PAGETABLE_PMD_FOLDED
	/* Cursor for the pgd_list walk below. */
	struct list_head *pos;
#endif

	/* Get a pointer to the pmd entry that we need to change. */
	addr &= HPAGE_MASK;
	BUG_ON(pgd_addr_invalid(addr));
	BUG_ON(addr < PAGE_OFFSET);  /* only for kernel LOWMEM */
	pgd = swapper_pg_dir + pgd_index(addr);
	pud = pud_offset(pgd, addr);
	BUG_ON(!pud_present(*pud));
	pmd = pmd_offset(pud, addr);
	BUG_ON(!pmd_present(*pmd));
	/* Unlocked early exit: not a huge page, so nothing to shatter. */
	if (!pmd_huge_page(*pmd))
		return;

	/*
	 * Grab the pgd_lock, since we may need it to walk the pgd_list,
	 * and since we need some kind of lock here to avoid races.
	 */
	spin_lock_irqsave(&pgd_lock, flags);
	/* Re-check now that the lock is held. */
	if (!pmd_huge_page(*pmd)) {
		/* Lost the race to convert the huge page. */
		spin_unlock_irqrestore(&pgd_lock, flags);
		return;
	}

	/*
	 * Shatter the huge page into the preallocated L2 page table.
	 * The pmd slot is read as a pte to obtain the pfn that is
	 * handed to get_prealloc_pte().
	 */
	pmd_populate_kernel(&init_mm, pmd,
			    get_prealloc_pte(pte_pfn(*(pte_t *)pmd)));

#ifdef __PAGETABLE_PMD_FOLDED
	/* Walk every pgd on the system and update the pmd there. */
	list_for_each(pos, &pgd_list) {
		pmd_t *copy_pmd;
		pgd = list_to_pgd(pos) + pgd_index(addr);
		pud = pud_offset(pgd, addr);
		copy_pmd = pmd_offset(pud, addr);
		__set_pmd(copy_pmd, *pmd);
	}
#endif

	/* Tell every cpu to notice the change. */
	flush_remote(0, 0, NULL, addr, HPAGE_SIZE, HPAGE_SIZE,
		     cpu_possible_mask, NULL, 0);

	/* Hold the lock until the TLB flush is finished to avoid races. */
	spin_unlock_irqrestore(&pgd_lock, flags);
}

/*
 * List of all pgd's needed so it can invalidate entries in both cached
 * and uncached pgd's. This is essentially codepath-based locking
 * against pageattr.c; it is the unique case in which a valid change
 * of kernel pagetables can't be lazily synchronized by vmalloc faults.
 * vmalloc faults work because attached pagetables are never freed.
 * The locking scheme was chosen on the basis of manfred's
 * recommendations and having no core impact whatsoever.
 * -- wli
 *
 * In this file pgd_lock is held while pgd_list is modified (pgd_ctor(),
 * pgd_dtor()) and while shatter_huge_page() rewrites a kernel pmd.
 */
DEFINE_SPINLOCK(pgd_lock);
LIST_HEAD(pgd_list);

/* Link "pgd" onto pgd_list; the caller in this file holds pgd_lock. */
static inline void pgd_list_add(pgd_t *pgd)
{
	list_add(pgd_to_list(pgd), &pgd_list);
}

/* Unlink "pgd" from pgd_list; the caller in this file holds pgd_lock. */
static inline void pgd_list_del(pgd_t *pgd)
{
	list_del(pgd_to_list(pgd));
}

/* pgd index of PAGE_OFFSET, i.e. the first slot copied from swapper_pg_dir. */
#define KERNEL_PGD_INDEX_START pgd_index(PAGE_OFFSET)
/* Number of pgd slots from KERNEL_PGD_INDEX_START to the end of the table. */
#define KERNEL_PGD_PTRS (PTRS_PER_PGD - KERNEL_PGD_INDEX_START)

/*
 * Initialize a freshly allocated pgd: zero the slots below
 * KERNEL_PGD_INDEX_START, copy the remaining slots from swapper_pg_dir,
 * and add the pgd to pgd_list.  The copy and the list insertion are
 * done with pgd_lock held.
 */
static void pgd_ctor(pgd_t *pgd)
{
	unsigned long irq_state;
	pgd_t *kernel_dst = pgd + KERNEL_PGD_INDEX_START;
	pgd_t *kernel_src = swapper_pg_dir + KERNEL_PGD_INDEX_START;

	memset(pgd, 0, KERNEL_PGD_INDEX_START * sizeof(pgd_t));

	spin_lock_irqsave(&pgd_lock, irq_state);

#ifndef __tilegx__
	/*
	 * The user interrupt vector must have no L2 in the swapper,
	 * so that every new page table starts with an empty one.
	 */
	BUG_ON(((u64 *)swapper_pg_dir)[pgd_index(MEM_USER_INTRPT)] != 0);
#endif

	memcpy(kernel_dst, kernel_src, KERNEL_PGD_PTRS * sizeof(pgd_t));
	pgd_list_add(pgd);

	spin_unlock_irqrestore(&pgd_lock, irq_state);
}

/* Take "pgd" off pgd_list, holding pgd_lock for the removal. */
static void pgd_dtor(pgd_t *pgd)
{
	/* May be reached from interrupt context, hence irqsave. */
	unsigned long irq_state;

	spin_lock_irqsave(&pgd_lock, irq_state);
	pgd_list_del(pgd);
	spin_unlock_irqrestore(&pgd_lock, irq_state);
}

pgd_t *pgd_alloc(struct mm_struct *mm)
{
	pgd_t *pgd = kmem_cache_alloc(pgd_cache, GFP_KERNEL);
	if (pgd)
		pgd_ctor(pgd);
	return pgd;
}

/*
 * Release a pgd obtained from pgd_alloc(): run pgd_dtor() on it, then
 * return it to pgd_cache.  "mm" is not used.
 */
void pgd_free(struct mm_struct *mm, pgd_t *pgd)
{
	pgd_dtor(pgd);
	kmem_cache_free(pgd_cache, pgd);
}


/* Number of pages in one user L2 page-table allocation (2^order). */
#define L2_USER_PGTABLE_PAGES (1 << L2_USER_PGTABLE_ORDER)

/*
 * Allocate a zeroed user L2 page table of L2_USER_PGTABLE_ORDER pages
 * and run pgtable_page_ctor() on its first page.  Returns the first
 * page, or NULL if the allocation fails.  Neither "mm" nor "address"
 * is used.
 */
struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address)
{
	struct page *head;
	gfp_t gfp = GFP_KERNEL | __GFP_REPEAT | __GFP_ZERO;
#if L2_USER_PGTABLE_ORDER > 0
	int idx;
#endif

#ifdef CONFIG_HIGHPTE
	gfp |= __GFP_HIGHMEM;
#endif

	head = alloc_pages(gfp, L2_USER_PGTABLE_ORDER);
	if (!head)
		return NULL;

#if L2_USER_PGTABLE_ORDER > 0
	/*
	 * Give each trailing page its own page_count() of one and count
	 * it as a page-table page.  __GFP_COMP is avoided because it
	 * does not appear to work correctly with tlb_remove_page().
	 */
	for (idx = 1; idx < L2_USER_PGTABLE_PAGES; idx++) {
		struct page *tail = head + idx;

		init_page_count(tail);
		inc_zone_page_state(tail, NR_PAGETABLE);
	}
#endif

	pgtable_page_ctor(head);
	return head;
}

/*
 * Free page immediately (used in __pte_alloc if we raced with another
 * process).  We have to correct whatever pte_alloc_one() did before
 * returning the pages to the allocator.  "mm" is not used.
 */
void pte_free(struct mm_struct *mm, struct page *p)
{
	int i;

	pgtable_page_dtor(p);
	__free_page(p);

	/*
	 * Drop the NR_PAGETABLE accounting for each trailing page while
	 * we still own it, and only then release it, mirroring the
	 * dtor-then-free order used for the first page above.  A page
	 * should not be touched once it is back with the allocator.
	 */
	for (i = 1; i < L2_USER_PGTABLE_PAGES; ++i) {
		dec_zone_page_state(p+i, NR_PAGETABLE);
		__free_page(p+i);
	}
}

/*
 * Hand a user L2 page table to the mmu_gather "tlb" for freeing.
 * This undoes pte_alloc_one(): the first page gets pgtable_page_dtor(),
 * each trailing page has its NR_PAGETABLE accounting dropped, and all
 * pages are passed to tlb_remove_page().  "address" is not used.
 */
void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte,
		    unsigned long address)
{
	int i;

	pgtable_page_dtor(pte);
	tlb_remove_page(tlb, pte);

	/*
	 * Adjust the accounting before giving the page away, mirroring
	 * the dtor-then-remove order used for the first page; the page
	 * should not be touched after tlb_remove_page() has taken it.
	 */
	for (i = 1; i < L2_USER_PGTABLE_PAGES; ++i) {
		dec_zone_page_state(pte + i, NR_PAGETABLE);
		tlb_remove_page(tlb, pte + i);
	}
}

#ifndef __tilegx__

/*
 * Test and clear the "accessed" bit of *ptep.  Returns 1 if the bit
 * was set (and has now been cleared), 0 if it was already clear.
 * "vma" and "addr" are not used.
 *
 * FIXME: needs to be atomic vs hypervisor writes.  For now the window
 * of vulnerability is kept small by doing an unlocked 8-bit update of
 * only the byte that holds the bit.
 */
int ptep_test_and_clear_young(struct vm_area_struct *vma,
			      unsigned long addr, pte_t *ptep)
{
#if HV_PTE_INDEX_ACCESSED < 8 || HV_PTE_INDEX_ACCESSED >= 16
# error Code assumes HV_PTE "accessed" bit in second byte
#endif
	const u8 young_mask = 1 << (HV_PTE_INDEX_ACCESSED - 8);
	u8 *bytes = (u8 *)ptep;
	u8 current_byte = bytes[1];

	if (current_byte & young_mask) {
		bytes[1] = current_byte & ~young_mask;
		return 1;
	}
	return 0;
}

/*
 * Clear the "writable" bit of *ptep.  "mm" and "addr" are not used.
 *
 * This is atomic with respect to hypervisor writes: the hypervisor
 * always writes the low word (which holds "accessed" and "dirty"),
 * whereas this routine touches only the high word.
 */
void ptep_set_wrprotect(struct mm_struct *mm,
			unsigned long addr, pte_t *ptep)
{
#if HV_PTE_INDEX_WRITABLE < 32
# error Code assumes HV_PTE "writable" bit in high word
#endif
	u32 *words = (u32 *)ptep;

	words[1] &= ~(1 << (HV_PTE_INDEX_WRITABLE - 32));
}

#endif

/*
 * Walk the page tables of "mm" (or swapper_pg_dir when "mm" is NULL)
 * and return a pointer to the entry that maps "addr".
 *
 * For a huge-page mapping the pmd slot itself is returned, cast to
 * pte_t *.  Returns NULL if the address is invalid or if the pud or
 * pmd level is not present.
 */
pte_t *virt_to_pte(struct mm_struct *mm, unsigned long addr)
{
	pgd_t *pgd_entry;
	pud_t *pud_entry;
	pmd_t *pmd_entry;

	if (pgd_addr_invalid(addr))
		return NULL;

	if (mm)
		pgd_entry = pgd_offset(mm, addr);
	else
		pgd_entry = swapper_pg_dir + pgd_index(addr);

	pud_entry = pud_offset(pgd_entry, addr);
	if (!pud_present(*pud_entry))
		return NULL;

	pmd_entry = pmd_offset(pud_entry, addr);
	/* The huge-page test deliberately precedes the present test. */
	if (pmd_huge_page(*pmd_entry))
		return (pte_t *)pmd_entry;
	if (!pmd_present(*pmd_entry))
		return NULL;

	return pte_offset_kernel(pmd_entry, addr);
}

/*
 * Return "prot" with its LOTAR set to the grid position of "cpu",
 * where x = cpu % smp_width and y = cpu / smp_width.
 *
 * It is a bug to call this with a cpu outside the grid or NR_CPUS,
 * with a cpu that is not a valid LOTAR, or with a "prot" whose mode
 * is not HV_PTE_MODE_CACHE_TILE_L3.
 */
pgprot_t set_remote_cache_cpu(pgprot_t prot, int cpu)
{
	unsigned int grid_width = smp_width;
	int col = cpu % grid_width;
	int row = cpu / grid_width;

	BUG_ON(row >= smp_height);
	BUG_ON(hv_pte_get_mode(prot) != HV_PTE_MODE_CACHE_TILE_L3);
	BUG_ON(cpu < 0 || cpu >= NR_CPUS);
	BUG_ON(!cpu_is_valid_lotar(cpu));

	return hv_pte_set_lotar(prot, HV_XY_TO_LOTAR(col, row));
}

/*
 * Inverse of set_remote_cache_cpu(): recover the cpu number from the
 * LOTAR stored in "prot" as x + y * smp_width.  It is a bug if the
 * mode of "prot" is not HV_PTE_MODE_CACHE_TILE_L3.
 */
int get_remote_cache_cpu(pgprot_t prot)
{
	HV_LOTAR home = hv_pte_get_lotar(prot);
	int col = HV_LOTAR_X(home);
	int row = HV_LOTAR_Y(home);

	BUG_ON(hv_pte_get_mode(prot) != HV_PTE_MODE_CACHE_TILE_L3);

	return row * smp_width + col;
}

/*
 * Translate a kernel VA into a physical address plus homing information.
 *
 * On return *cpa holds __pa(va) and *pte holds an otherwise empty pte
 * carrying the home of the page that backs "va".  No page table is
 * written.  Always returns 0 (open question from the original author:
 * return non-zero if not hfh?).
 */
int va_to_cpa_and_pte(void *va, unsigned long long *cpa, pte_t *pte)
{
	pte_t blank = { 0 };
	struct page *backing = virt_to_page(va);

	*cpa = __pa(va);
	*pte = pte_set_home(blank, page_home(backing));

	return 0;
}
EXPORT_SYMBOL(va_to_cpa_and_pte);

/*
 * Store "pte" into *ptep without adjusting its home.
 *
 * On tilegx this is a single assignment.  Otherwise the pte is written
 * as two 32-bit halves, ordered so that the low word (which holds the
 * present and migrating bits) is written last when installing a
 * present pte and first when installing a non-present one.
 */
void __set_pte(pte_t *ptep, pte_t pte)
{
#ifdef __tilegx__
	*ptep = pte;
#else
# if HV_PTE_INDEX_PRESENT >= 32 || HV_PTE_INDEX_MIGRATING >= 32
#  error Must write the present and migrating bits last
# endif
	if (pte_present(pte)) {
		/* Becoming present: high word first, then the low word. */
		((u32 *)ptep)[1] = (u32)(pte_val(pte) >> 32);
		/* Keep the compiler from reordering the two stores. */
		barrier();
		((u32 *)ptep)[0] = (u32)(pte_val(pte));
	} else {
		/* Not present: low word first, then the high word. */
		((u32 *)ptep)[0] = (u32)(pte_val(pte));
		barrier();
		((u32 *)ptep)[1] = (u32)(pte_val(pte) >> 32);
	}
#endif /* __tilegx__ */
}

/*
 * Store "pte" into *ptep after setting its home to that of the page
 * its pfn refers to.
 */
void set_pte(pte_t *ptep, pte_t pte)
{
	struct page *target = pfn_to_page(pte_pfn(pte));

	/* Bring the PTE's home in line with the page's home. */
	__set_pte(ptep, pte_set_home(pte, page_home(target)));
}

/*
 * Can this mm load a PTE with cached_priority set?
 * Returns the mm's context.priority_cached value (non-zero means yes).
 */
static inline int mm_is_priority_cached(struct mm_struct *mm)
{
	return mm->context.priority_cached;
}

/*
 * Mark "mm" as using priority caching.  The first time this happens
 * for an mm, its priority_cached flag is set and the hypervisor is
 * told via hv_set_caching(); later calls do nothing.
 */
void start_mm_caching(struct mm_struct *mm)
{
	if (mm_is_priority_cached(mm))
		return;

	mm->context.priority_cached = -1U;
	hv_set_caching(-1U);
}

/*
 * Re-validate the priority_cached flag of "mm" and return it.
 *
 * A zero flag needs no scan, because the flag is made non-zero as soon
 * as a MAP_CACHE_PRIORITY mapping is first considered.  Otherwise the
 * vma list is scanned and the flag is cleared if no vma still has
 * cached_priority set in its page protection.
 *
 * mmap_sem is only try-locked: we are in interrupt context (servicing
 * switch_mm), so if the semaphore is unavailable the flag is simply
 * left set and the cache stays marked for priority.  Presumably a later
 * call will succeed and clear it.
 */
static unsigned int update_priority_cached(struct mm_struct *mm)
{
	struct vm_area_struct *vma;
	int still_needed = 0;

	if (!mm->context.priority_cached)
		goto out;
	if (!down_write_trylock(&mm->mmap_sem))
		goto out;

	for (vma = mm->mmap; vma != NULL; vma = vma->vm_next) {
		if (hv_pte_get_cached_priority(vma->vm_page_prot)) {
			still_needed = 1;
			break;
		}
	}
	if (!still_needed)
		mm->context.priority_cached = 0;

	up_write(&mm->mmap_sem);
out:
	return mm->context.priority_cached;
}

/*
 * Set caching correctly for an mm that we are switching to.
 *
 * If "next" uses priority caching, its flag is re-validated and the
 * result passed to hv_set_caching().  If it does not, caching is
 * reset to zero only when "prev" had it enabled; otherwise it is
 * assumed to be zero already.
 */
void check_mm_caching(struct mm_struct *prev, struct mm_struct *next)
{
	if (mm_is_priority_cached(next)) {
		hv_set_caching(update_priority_cached(next));
		return;
	}

	if (mm_is_priority_cached(prev))
		hv_set_caching(0);
}

#if CHIP_HAS_MMIO()

/*
 * Map an arbitrary MMIO address into VA space, taking the LOTAR (home)
 * for the mapping from "home".
 *
 * "phys_addr" need not be page-aligned; the returned pointer carries
 * the same sub-page offset.  Returns NULL for a zero size, a range that
 * wraps around, or if the VM area or page-table setup fails.
 */
void __iomem *ioremap_prot(resource_size_t phys_addr, unsigned long size,
			   pgprot_t home)
{
	void *addr;
	struct vm_struct *area;
	unsigned long offset, last_addr;
	pgprot_t pgprot;

	/* Don't allow wraparound or zero size */
	last_addr = phys_addr + size - 1;
	if (!size || last_addr < phys_addr)
		return NULL;

	/* Create a read/write, MMIO VA mapping homed at the requested shim. */
	pgprot = PAGE_KERNEL;
	pgprot = hv_pte_set_mode(pgprot, HV_PTE_MODE_MMIO);
	pgprot = hv_pte_set_lotar(pgprot, hv_pte_get_lotar(home));

	/*
	 * Mappings have to be page-aligned
	 */
	offset = phys_addr & ~PAGE_MASK;
	phys_addr &= PAGE_MASK;
	size = PAGE_ALIGN(last_addr+1) - phys_addr;

	/*
	 * Ok, go for it..
	 */
	area = get_vm_area(size, VM_IOREMAP /* | other flags? */);
	if (!area)
		return NULL;
	area->phys_addr = phys_addr;
	addr = area->addr;
	if (ioremap_page_range((unsigned long)addr, (unsigned long)addr + size,
			       phys_addr, pgprot)) {
		/*
		 * free_vm_area() both removes the area and frees its
		 * vm_struct; a bare remove_vm_area() would leak the
		 * descriptor it returns.
		 */
		free_vm_area(area);
		return NULL;
	}
	return (__force void __iomem *) (offset + (char *)addr);
}
EXPORT_SYMBOL(ioremap_prot);

/*
 * Map a PCI MMIO bus address into VA space.
 *
 * Not implemented: any call panics with the message below.
 */
void __iomem *ioremap(resource_size_t phys_addr, unsigned long size)
{
	panic("ioremap for PCI MMIO is not supported");
}
EXPORT_SYMBOL(ioremap);

/*
 * Unmap an MMIO VA mapping.
 *
 * "addr_in" may carry a sub-page offset; it is rounded down to the
 * start of its page before being passed to vunmap().
 *
 * NOTE(review): x86 instead looks the area up in vmlist and removes it
 * by hand with remove_vm_area().  No particular reason to do the same
 * here is known, so the plain vunmap() is kept.
 */
void iounmap(volatile void __iomem *addr_in)
{
	volatile void __iomem *addr = (volatile void __iomem *)
		(PAGE_MASK & (unsigned long __force)addr_in);

	vunmap((void * __force)addr);
}
EXPORT_SYMBOL(iounmap);

#endif /* CHIP_HAS_MMIO() */
Commit	Line	Data
867e359b CM	1	/*
	2	* Copyright 2010 Tilera Corporation. All Rights Reserved.
	3	*
	4	* This program is free software; you can redistribute it and/or
	5	* modify it under the terms of the GNU General Public License
	6	* as published by the Free Software Foundation, version 2.
	7	*
	8	* This program is distributed in the hope that it will be useful, but
	9	* WITHOUT ANY WARRANTY; without even the implied warranty of
	10	* MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
	11	* NON INFRINGEMENT. See the GNU General Public License for
	12	* more details.
	13	*/
	14
	15	#include <linux/sched.h>
	16	#include <linux/kernel.h>
	17	#include <linux/errno.h>
	18	#include <linux/mm.h>
	19	#include <linux/swap.h>
867e359b CM	20	#include <linux/highmem.h>
	21	#include <linux/slab.h>
	22	#include <linux/pagemap.h>
	23	#include <linux/spinlock.h>
	24	#include <linux/cpumask.h>
	25	#include <linux/module.h>
	26	#include <linux/io.h>
	27	#include <linux/vmalloc.h>
	28	#include <linux/smp.h>
	29
	30	#include <asm/system.h>
	31	#include <asm/pgtable.h>
	32	#include <asm/pgalloc.h>
	33	#include <asm/fixmap.h>
	34	#include <asm/tlb.h>
	35	#include <asm/tlbflush.h>
	36	#include <asm/homecache.h>
	37
	38	#define K(x) ((x) << (PAGE_SHIFT-10))
	39
	40	/*
	41	* The normal show_free_areas() is too verbose on Tile, with dozens
	42	* of processors and often four NUMA zones each with high and lowmem.
	43	*/
b2b755b5	44	void show_mem(unsigned int filter)
867e359b CM	45	{
	46	struct zone *zone;
	47
0707ad30	48	pr_err("Active:%lu inactive:%lu dirty:%lu writeback:%lu unstable:%lu"
867e359b CM	49	" free:%lu\n slab:%lu mapped:%lu pagetables:%lu bounce:%lu"
	50	" pagecache:%lu swap:%lu\n",
	51	(global_page_state(NR_ACTIVE_ANON) +
	52	global_page_state(NR_ACTIVE_FILE)),
	53	(global_page_state(NR_INACTIVE_ANON) +
	54	global_page_state(NR_INACTIVE_FILE)),
	55	global_page_state(NR_FILE_DIRTY),
	56	global_page_state(NR_WRITEBACK),
	57	global_page_state(NR_UNSTABLE_NFS),
	58	global_page_state(NR_FREE_PAGES),
	59	(global_page_state(NR_SLAB_RECLAIMABLE) +
	60	global_page_state(NR_SLAB_UNRECLAIMABLE)),
	61	global_page_state(NR_FILE_MAPPED),
	62	global_page_state(NR_PAGETABLE),
	63	global_page_state(NR_BOUNCE),
	64	global_page_state(NR_FILE_PAGES),
	65	nr_swap_pages);
	66
	67	for_each_zone(zone) {
	68	unsigned long flags, order, total = 0, largest_order = -1;
	69
	70	if (!populated_zone(zone))
	71	continue;
	72
867e359b CM	73	spin_lock_irqsave(&zone->lock, flags);
	74	for (order = 0; order < MAX_ORDER; order++) {
	75	int nr = zone->free_area[order].nr_free;
	76	total += nr << order;
	77	if (nr)
	78	largest_order = order;
	79	}
	80	spin_unlock_irqrestore(&zone->lock, flags);
0707ad30 CM	81	pr_err("Node %d %7s: %lukB (largest %luKb)\n",
0707ad30 CM	82	zone_to_nid(zone), zone->name,
867e359b CM	83	K(total), largest_order ? K(1UL) << largest_order : 0);
	84	}
	85	}
	86
	87	/*
	88	* Associate a virtual page frame with a given physical page frame
	89	* and protection flags for that frame.
	90	*/
	91	static void set_pte_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags)
	92	{
	93	pgd_t *pgd;
	94	pud_t *pud;
	95	pmd_t *pmd;
	96	pte_t *pte;
	97
	98	pgd = swapper_pg_dir + pgd_index(vaddr);
	99	if (pgd_none(*pgd)) {
	100	BUG();
	101	return;
	102	}
	103	pud = pud_offset(pgd, vaddr);
	104	if (pud_none(*pud)) {
	105	BUG();
	106	return;
	107	}
	108	pmd = pmd_offset(pud, vaddr);
	109	if (pmd_none(*pmd)) {
	110	BUG();
	111	return;
	112	}
	113	pte = pte_offset_kernel(pmd, vaddr);
	114	/* <pfn,flags> stored as-is, to permit clearing entries */
	115	set_pte(pte, pfn_pte(pfn, flags));
	116
	117	/*
	118	* It's enough to flush this one mapping.
	119	* This appears conservative since it is only called
	120	* from __set_fixmap.
	121	*/
	122	local_flush_tlb_page(NULL, vaddr, PAGE_SIZE);
	123	}
	124
867e359b CM	125	void __set_fixmap(enum fixed_addresses idx, unsigned long phys, pgprot_t flags)
	126	{
	127	unsigned long address = __fix_to_virt(idx);
	128
	129	if (idx >= __end_of_fixed_addresses) {
	130	BUG();
	131	return;
	132	}
	133	set_pte_pfn(address, phys >> PAGE_SHIFT, flags);
	134	}
	135
	136	#if defined(CONFIG_HIGHPTE)
38a6f426	137	pte_t _pte_offset_map(pmd_t dir, unsigned long address)
867e359b	138	{
38a6f426	139	pte_t pte = kmap_atomic(pmd_page(dir)) +
867e359b CM	140	(pmd_ptfn(*dir) << HV_LOG2_PAGE_TABLE_ALIGN) & ~PAGE_MASK;
	141	return &pte[pte_index(address)];
	142	}
	143	#endif
	144
76c567fb CM	145	/**
	146	* shatter_huge_page() - ensure a given address is mapped by a small page.
	147	*
	148	* This function converts a huge PTE mapping kernel LOWMEM into a bunch
	149	* of small PTEs with the same caching. No cache flush required, but we
	150	* must do a global TLB flush.
	151	*
	152	* Any caller that wishes to modify a kernel mapping that might
	153	* have been made with a huge page should call this function,
	154	* since doing so properly avoids race conditions with installing the
	155	* newly-shattered page and then flushing all the TLB entries.
	156	*
	157	* @addr: Address at which to shatter any existing huge page.
	158	*/
	159	void shatter_huge_page(unsigned long addr)
	160	{
	161	pgd_t *pgd;
	162	pud_t *pud;
	163	pmd_t *pmd;
	164	unsigned long flags = 0; /* happy compiler */
	165	#ifdef __PAGETABLE_PMD_FOLDED
	166	struct list_head *pos;
	167	#endif
	168
	169	/* Get a pointer to the pmd entry that we need to change. */
	170	addr &= HPAGE_MASK;
	171	BUG_ON(pgd_addr_invalid(addr));
	172	BUG_ON(addr < PAGE_OFFSET); /* only for kernel LOWMEM */
	173	pgd = swapper_pg_dir + pgd_index(addr);
	174	pud = pud_offset(pgd, addr);
	175	BUG_ON(!pud_present(*pud));
	176	pmd = pmd_offset(pud, addr);
	177	BUG_ON(!pmd_present(*pmd));
	178	if (!pmd_huge_page(*pmd))
	179	return;
	180
	181	/*
	182	* Grab the pgd_lock, since we may need it to walk the pgd_list,
	183	* and since we need some kind of lock here to avoid races.
	184	*/
	185	spin_lock_irqsave(&pgd_lock, flags);
	186	if (!pmd_huge_page(*pmd)) {
	187	/* Lost the race to convert the huge page. */
	188	spin_unlock_irqrestore(&pgd_lock, flags);
	189	return;
	190	}
	191
	192	/* Shatter the huge page into the preallocated L2 page table. */
	193	pmd_populate_kernel(&init_mm, pmd,
	194	get_prealloc_pte(pte_pfn((pte_t )pmd)));
	195
	196	#ifdef __PAGETABLE_PMD_FOLDED
	197	/* Walk every pgd on the system and update the pmd there. */
	198	list_for_each(pos, &pgd_list) {
	199	pmd_t *copy_pmd;
	200	pgd = list_to_pgd(pos) + pgd_index(addr);
	201	pud = pud_offset(pgd, addr);
	202	copy_pmd = pmd_offset(pud, addr);
	203	__set_pmd(copy_pmd, *pmd);
	204	}
	205	#endif
	206
	207	/* Tell every cpu to notice the change. */
	208	flush_remote(0, 0, NULL, addr, HPAGE_SIZE, HPAGE_SIZE,
209	cpu_possible_mask, NULL, 0);
210
211	/* Hold the lock until the TLB flush is finished to avoid races. */
212	spin_unlock_irqrestore(&pgd_lock, flags);
213	}
214
867e359b CM	215	/*
	216	* List of all pgd's needed so it can invalidate entries in both cached
	217	* and uncached pgd's. This is essentially codepath-based locking
	218	* against pageattr.c; it is the unique case in which a valid change
	219	* of kernel pagetables can't be lazily synchronized by vmalloc faults.
	220	* vmalloc faults work because attached pagetables are never freed.
	221	* The locking scheme was chosen on the basis of manfred's
	222	* recommendations and having no core impact whatsoever.
	223	* -- wli
	224	*/
	225	DEFINE_SPINLOCK(pgd_lock);
	226	LIST_HEAD(pgd_list);
	227
	228	static inline void pgd_list_add(pgd_t *pgd)
	229	{
	230	list_add(pgd_to_list(pgd), &pgd_list);
	231	}
	232
	233	static inline void pgd_list_del(pgd_t *pgd)
	234	{
	235	list_del(pgd_to_list(pgd));
	236	}
	237
	238	#define KERNEL_PGD_INDEX_START pgd_index(PAGE_OFFSET)
	239	#define KERNEL_PGD_PTRS (PTRS_PER_PGD - KERNEL_PGD_INDEX_START)
	240
	241	static void pgd_ctor(pgd_t *pgd)
	242	{
	243	unsigned long flags;
	244
	245	memset(pgd, 0, KERNEL_PGD_INDEX_START*sizeof(pgd_t));
	246	spin_lock_irqsave(&pgd_lock, flags);
	247
	248	#ifndef __tilegx__
	249	/*
	250	* Check that the user interrupt vector has no L2.
	251	* It never should for the swapper, and new page tables
	252	* should always start with an empty user interrupt vector.
	253	*/
	254	BUG_ON(((u64 *)swapper_pg_dir)[pgd_index(MEM_USER_INTRPT)] != 0);
	255	#endif
	256
76c567fb CM	257	memcpy(pgd + KERNEL_PGD_INDEX_START,
	258	swapper_pg_dir + KERNEL_PGD_INDEX_START,
	259	KERNEL_PGD_PTRS * sizeof(pgd_t));
867e359b CM	260
	261	pgd_list_add(pgd);
	262	spin_unlock_irqrestore(&pgd_lock, flags);
	263	}
	264
	265	static void pgd_dtor(pgd_t *pgd)
	266	{
	267	unsigned long flags; /* can be called from interrupt context */
	268
	269	spin_lock_irqsave(&pgd_lock, flags);
	270	pgd_list_del(pgd);
	271	spin_unlock_irqrestore(&pgd_lock, flags);
	272	}
	273
	274	pgd_t pgd_alloc(struct mm_struct mm)
	275	{
	276	pgd_t *pgd = kmem_cache_alloc(pgd_cache, GFP_KERNEL);
	277	if (pgd)
	278	pgd_ctor(pgd);
	279	return pgd;
	280	}
	281
	282	void pgd_free(struct mm_struct mm, pgd_t pgd)
	283	{
	284	pgd_dtor(pgd);
	285	kmem_cache_free(pgd_cache, pgd);
	286	}
	287
	288
	289	#define L2_USER_PGTABLE_PAGES (1 << L2_USER_PGTABLE_ORDER)
	290
	291	struct page pte_alloc_one(struct mm_struct mm, unsigned long address)
	292	{
76c567fb	293	gfp_t flags = GFP_KERNEL\|__GFP_REPEAT\|__GFP_ZERO;
867e359b	294	struct page *p;
76c567fb CM	295	#if L2_USER_PGTABLE_ORDER > 0
	296	int i;
	297	#endif
867e359b CM	298
	299	#ifdef CONFIG_HIGHPTE
	300	flags \|= __GFP_HIGHMEM;
	301	#endif
	302
	303	p = alloc_pages(flags, L2_USER_PGTABLE_ORDER);
	304	if (p == NULL)
	305	return NULL;
	306
76c567fb CM	307	#if L2_USER_PGTABLE_ORDER > 0
	308	/*
	309	* Make every page have a page_count() of one, not just the first.
	310	* We don't use __GFP_COMP since it doesn't look like it works
	311	* correctly with tlb_remove_page().
	312	*/
	313	for (i = 1; i < L2_USER_PGTABLE_PAGES; ++i) {
	314	init_page_count(p+i);
	315	inc_zone_page_state(p+i, NR_PAGETABLE);
	316	}
	317	#endif
	318
867e359b CM	319	pgtable_page_ctor(p);
	320	return p;
	321	}
	322
	323	/*
	324	* Free page immediately (used in __pte_alloc if we raced with another
	325	* process). We have to correct whatever pte_alloc_one() did before
	326	* returning the pages to the allocator.
	327	*/
	328	void pte_free(struct mm_struct mm, struct page p)
	329	{
76c567fb CM	330	int i;
76c567fb CM	331
867e359b	332	pgtable_page_dtor(p);
76c567fb CM	333	__free_page(p);
	334
	335	for (i = 1; i < L2_USER_PGTABLE_PAGES; ++i) {
	336	__free_page(p+i);
	337	dec_zone_page_state(p+i, NR_PAGETABLE);
	338	}
867e359b CM	339	}
	340
	341	void __pte_free_tlb(struct mmu_gather tlb, struct page pte,
	342	unsigned long address)
	343	{
	344	int i;
	345
	346	pgtable_page_dtor(pte);
76c567fb CM	347	tlb_remove_page(tlb, pte);
	348
	349	for (i = 1; i < L2_USER_PGTABLE_PAGES; ++i) {
342d87ef	350	tlb_remove_page(tlb, pte + i);
76c567fb CM	351	dec_zone_page_state(pte + i, NR_PAGETABLE);
76c567fb CM	352	}
867e359b CM	353	}
	354
	355	#ifndef __tilegx__
	356
	357	/*
	358	* FIXME: needs to be atomic vs hypervisor writes. For now we make the
	359	* window of vulnerability a bit smaller by doing an unlocked 8-bit update.
	360	*/
	361	int ptep_test_and_clear_young(struct vm_area_struct *vma,
	362	unsigned long addr, pte_t *ptep)
	363	{
	364	#if HV_PTE_INDEX_ACCESSED < 8 \|\| HV_PTE_INDEX_ACCESSED >= 16
	365	# error Code assumes HV_PTE "accessed" bit in second byte
	366	#endif
	367	u8 tmp = (u8 )ptep;
	368	u8 second_byte = tmp[1];
	369	if (!(second_byte & (1 << (HV_PTE_INDEX_ACCESSED - 8))))
	370	return 0;
	371	tmp[1] = second_byte & ~(1 << (HV_PTE_INDEX_ACCESSED - 8));
	372	return 1;
	373	}
	374
	375	/*
	376	* This implementation is atomic vs hypervisor writes, since the hypervisor
	377	* always writes the low word (where "accessed" and "dirty" are) and this
	378	* routine only writes the high word.
	379	*/
	380	void ptep_set_wrprotect(struct mm_struct *mm,
	381	unsigned long addr, pte_t *ptep)
	382	{
	383	#if HV_PTE_INDEX_WRITABLE < 32
	384	# error Code assumes HV_PTE "writable" bit in high word
	385	#endif
	386	u32 tmp = (u32 )ptep;
	387	tmp[1] = tmp[1] & ~(1 << (HV_PTE_INDEX_WRITABLE - 32));
	388	}
	389
	390	#endif
	391
	392	pte_t virt_to_pte(struct mm_struct mm, unsigned long addr)
	393	{
	394	pgd_t *pgd;
	395	pud_t *pud;
	396	pmd_t *pmd;
	397
	398	if (pgd_addr_invalid(addr))
	399	return NULL;
	400
	401	pgd = mm ? pgd_offset(mm, addr) : swapper_pg_dir + pgd_index(addr);
	402	pud = pud_offset(pgd, addr);
	403	if (!pud_present(*pud))
	404	return NULL;
	405	pmd = pmd_offset(pud, addr);
	406	if (pmd_huge_page(*pmd))
	407	return (pte_t *)pmd;
	408	if (!pmd_present(*pmd))
	409	return NULL;
	410	return pte_offset_kernel(pmd, addr);
	411	}
	412
	413	pgprot_t set_remote_cache_cpu(pgprot_t prot, int cpu)
	414	{
	415	unsigned int width = smp_width;
	416	int x = cpu % width;
417	int y = cpu / width;
418	BUG_ON(y >= smp_height);
419	BUG_ON(hv_pte_get_mode(prot) != HV_PTE_MODE_CACHE_TILE_L3);
420	BUG_ON(cpu < 0 \|\| cpu >= NR_CPUS);
421	BUG_ON(!cpu_is_valid_lotar(cpu));
422	return hv_pte_set_lotar(prot, HV_XY_TO_LOTAR(x, y));
423	}
424
425	int get_remote_cache_cpu(pgprot_t prot)
426	{
427	HV_LOTAR lotar = hv_pte_get_lotar(prot);
428	int x = HV_LOTAR_X(lotar);
429	int y = HV_LOTAR_Y(lotar);
430	BUG_ON(hv_pte_get_mode(prot) != HV_PTE_MODE_CACHE_TILE_L3);
431	return x + y * smp_width;
432	}
433
76c567fb CM	434	/*
	435	* Convert a kernel VA to a PA and homing information.
	436	*/
	437	int va_to_cpa_and_pte(void va, unsigned long long cpa, pte_t *pte)
867e359b	438	{
76c567fb CM	439	struct page *page = virt_to_page(va);
76c567fb CM	440	pte_t null_pte = { 0 };
867e359b	441
76c567fb CM	442	*cpa = __pa(va);
	443
	444	/* Note that this is not writing a page table, just returning a pte. */
	445	*pte = pte_set_home(null_pte, page_home(page));
867e359b	446
76c567fb CM	447	return 0; /* return non-zero if not hfh? */
	448	}
	449	EXPORT_SYMBOL(va_to_cpa_and_pte);
	450
	451	void __set_pte(pte_t *ptep, pte_t pte)
	452	{
867e359b CM	453	#ifdef __tilegx__
	454	*ptep = pte;
	455	#else
76c567fb CM	456	# if HV_PTE_INDEX_PRESENT >= 32 \|\| HV_PTE_INDEX_MIGRATING >= 32
	457	# error Must write the present and migrating bits last
	458	# endif
	459	if (pte_present(pte)) {
	460	((u32 *)ptep)[1] = (u32)(pte_val(pte) >> 32);
	461	barrier();
	462	((u32 *)ptep)[0] = (u32)(pte_val(pte));
	463	} else {
	464	((u32 *)ptep)[0] = (u32)(pte_val(pte));
	465	barrier();
	466	((u32 *)ptep)[1] = (u32)(pte_val(pte) >> 32);
	467	}
	468	#endif /* __tilegx__ */
	469	}
	470
	471	void set_pte(pte_t *ptep, pte_t pte)
	472	{
	473	struct page *page = pfn_to_page(pte_pfn(pte));
	474
	475	/* Update the home of a PTE if necessary */
	476	pte = pte_set_home(pte, page_home(page));
	477
	478	__set_pte(ptep, pte);
867e359b CM	479	}
	480
	481	/* Can this mm load a PTE with cached_priority set? */
	482	static inline int mm_is_priority_cached(struct mm_struct *mm)
	483	{
	484	return mm->context.priority_cached;
	485	}
	486
	487	/*
	488	* Add a priority mapping to an mm_context and
	489	* notify the hypervisor if this is the first one.
	490	*/
	491	void start_mm_caching(struct mm_struct *mm)
	492	{
	493	if (!mm_is_priority_cached(mm)) {
	494	mm->context.priority_cached = -1U;
	495	hv_set_caching(-1U);
	496	}
	497	}
	498
	499	/*
	500	* Validate and return the priority_cached flag. We know if it's zero
	501	* that we don't need to scan, since we immediately set it non-zero
	502	* when we first consider a MAP_CACHE_PRIORITY mapping.
	503	*
	504	* We only _try_ to acquire the mmap_sem semaphore; if we can't acquire it,
	505	* since we're in an interrupt context (servicing switch_mm) we don't
	506	* worry about it and don't unset the "priority_cached" field.
	507	* Presumably we'll come back later and have more luck and clear
	508	* the value then; for now we'll just keep the cache marked for priority.
	509	*/
	510	static unsigned int update_priority_cached(struct mm_struct *mm)
	511	{
	512	if (mm->context.priority_cached && down_write_trylock(&mm->mmap_sem)) {
	513	struct vm_area_struct *vm;
	514	for (vm = mm->mmap; vm; vm = vm->vm_next) {
	515	if (hv_pte_get_cached_priority(vm->vm_page_prot))
	516	break;
	517	}
	518	if (vm == NULL)
	519	mm->context.priority_cached = 0;
	520	up_write(&mm->mmap_sem);
	521	}
	522	return mm->context.priority_cached;
	523	}
	524
	525	/* Set caching correctly for an mm that we are switching to. */
	526	void check_mm_caching(struct mm_struct prev, struct mm_struct next)
	527	{
	528	if (!mm_is_priority_cached(next)) {
	529	/*
	530	* If the new mm doesn't use priority caching, just see if we
	531	* need the hv_set_caching(), or can assume it's already zero.
	532	*/
	533	if (mm_is_priority_cached(prev))
	534	hv_set_caching(0);
	535	} else {
	536	hv_set_caching(update_priority_cached(next));
	537	}
	538	}
	539
	540	#if CHIP_HAS_MMIO()
	541
	542	/* Map an arbitrary MMIO address, homed according to pgprot, into VA space. */
543	void __iomem *ioremap_prot(resource_size_t phys_addr, unsigned long size,
544	pgprot_t home)
545	{
546	void *addr;
547	struct vm_struct *area;
548	unsigned long offset, last_addr;
549	pgprot_t pgprot;
550
551	/* Don't allow wraparound or zero size */
552	last_addr = phys_addr + size - 1;
553	if (!size \|\| last_addr < phys_addr)
554	return NULL;
555
556	/* Create a read/write, MMIO VA mapping homed at the requested shim. */
557	pgprot = PAGE_KERNEL;
558	pgprot = hv_pte_set_mode(pgprot, HV_PTE_MODE_MMIO);
559	pgprot = hv_pte_set_lotar(pgprot, hv_pte_get_lotar(home));
560
561	/*
562	* Mappings have to be page-aligned
563	*/
564	offset = phys_addr & ~PAGE_MASK;
565	phys_addr &= PAGE_MASK;
566	size = PAGE_ALIGN(last_addr+1) - phys_addr;
567
568	/*
569	* Ok, go for it..
570	*/
571	area = get_vm_area(size, VM_IOREMAP /* \| other flags? */);
572	if (!area)
573	return NULL;
574	area->phys_addr = phys_addr;
575	addr = area->addr;
576	if (ioremap_page_range((unsigned long)addr, (unsigned long)addr + size,
577	phys_addr, pgprot)) {
578	remove_vm_area((void *)(PAGE_MASK & (unsigned long) addr));
579	return NULL;
580	}
581	return (__force void __iomem ) (offset + (char )addr);
582	}
583	EXPORT_SYMBOL(ioremap_prot);
584
585	/* Map a PCI MMIO bus address into VA space. */
586	void __iomem *ioremap(resource_size_t phys_addr, unsigned long size)
587	{
588	panic("ioremap for PCI MMIO is not supported");
589	}
590	EXPORT_SYMBOL(ioremap);
591
592	/* Unmap an MMIO VA mapping. */
593	void iounmap(volatile void __iomem *addr_in)
594	{
595	volatile void __iomem addr = (volatile void __iomem )
596	(PAGE_MASK & (unsigned long __force)addr_in);
597	#if 1
598	vunmap((void * __force)addr);
599	#else
600	/* x86 uses this complicated flow instead of vunmap(). Is
601	* there any particular reason we should do the same? */
602	struct vm_struct p, o;
603
604	/* Use the vm area unlocked, assuming the caller
605	ensures there isn't another iounmap for the same address
606	in parallel. Reuse of the virtual address is prevented by
607	leaving it in the global lists until we're done with it.
608	cpa takes care of the direct mappings. */
609	read_lock(&vmlist_lock);
610	for (p = vmlist; p; p = p->next) {
611	if (p->addr == addr)
612	break;
613	}
614	read_unlock(&vmlist_lock);
615
616	if (!p) {
0707ad30	617	pr_err("iounmap: bad address %p\n", addr);
867e359b CM	618	dump_stack();
	619	return;
	620	}
	621
	622	/* Finally remove it */
	623	o = remove_vm_area((void *)addr);
	624	BUG_ON(p != o \|\| o == NULL);
	625	kfree(p);
	626	#endif
	627	}
	628	EXPORT_SYMBOL(iounmap);
	629
	630	#endif /* CHIP_HAS_MMIO() */