/*
 * Copyright 2010 Tilera Corporation. All Rights Reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation, version 2.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
 * NON INFRINGEMENT.  See the GNU General Public License for
 * more details.
 */

#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/highmem.h>
#include <linux/slab.h>
#include <linux/pagemap.h>
#include <linux/spinlock.h>
#include <linux/cpumask.h>
#include <linux/module.h>
#include <linux/io.h>
#include <linux/vmalloc.h>
#include <linux/smp.h>

#include <asm/system.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/fixmap.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>
#include <asm/homecache.h>

#define K(x) ((x) << (PAGE_SHIFT-10))
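
/*
 * K() converts a page count to kilobytes.  For example, with 4KB pages
 * (PAGE_SHIFT == 12) the shift is 2, so K(3) == 12; with the 64KB pages
 * this architecture also supports (PAGE_SHIFT == 16), K(3) == 192.
 */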

/*
 * The normal show_free_areas() is too verbose on Tile, with dozens
 * of processors and often four NUMA zones each with high and lowmem.
 */
void show_mem(void)
{
	struct zone *zone;

	pr_err("Active:%lu inactive:%lu dirty:%lu writeback:%lu unstable:%lu"
	       " free:%lu\n slab:%lu mapped:%lu pagetables:%lu bounce:%lu"
	       " pagecache:%lu swap:%lu\n",
	       (global_page_state(NR_ACTIVE_ANON) +
		global_page_state(NR_ACTIVE_FILE)),
	       (global_page_state(NR_INACTIVE_ANON) +
		global_page_state(NR_INACTIVE_FILE)),
	       global_page_state(NR_FILE_DIRTY),
	       global_page_state(NR_WRITEBACK),
	       global_page_state(NR_UNSTABLE_NFS),
	       global_page_state(NR_FREE_PAGES),
	       (global_page_state(NR_SLAB_RECLAIMABLE) +
		global_page_state(NR_SLAB_UNRECLAIMABLE)),
	       global_page_state(NR_FILE_MAPPED),
	       global_page_state(NR_PAGETABLE),
	       global_page_state(NR_BOUNCE),
	       global_page_state(NR_FILE_PAGES),
	       nr_swap_pages);

	for_each_zone(zone) {
		unsigned long flags, order, total = 0, largest_order = -1;

		if (!populated_zone(zone))
			continue;

		spin_lock_irqsave(&zone->lock, flags);
		for (order = 0; order < MAX_ORDER; order++) {
			int nr = zone->free_area[order].nr_free;
			total += nr << order;
			if (nr)
				largest_order = order;
		}
		spin_unlock_irqrestore(&zone->lock, flags);
		pr_err("Node %d %7s: %lukB (largest %lukB)\n",
		       zone_to_nid(zone), zone->name, K(total),
		       largest_order != -1UL ?
		       K(1UL) << largest_order : 0);
	}
}
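
/*
 * For illustration only, the loop above prints one line per populated
 * zone, along the lines of (values made up):
 *
 *	Node 0  Normal: 123456kB (largest 4096kB)
 *
 * where "largest" is the block size of the highest-order free block.
 */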

/*
 * Associate a virtual page frame with a given physical page frame
 * and protection flags for that frame.
 */
static void set_pte_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags)
{
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	pgd = swapper_pg_dir + pgd_index(vaddr);
	if (pgd_none(*pgd)) {
		BUG();
		return;
	}
	pud = pud_offset(pgd, vaddr);
	if (pud_none(*pud)) {
		BUG();
		return;
	}
	pmd = pmd_offset(pud, vaddr);
	if (pmd_none(*pmd)) {
		BUG();
		return;
	}
	pte = pte_offset_kernel(pmd, vaddr);
	/* <pfn,flags> stored as-is, to permit clearing entries */
	set_pte(pte, pfn_pte(pfn, flags));

	/*
	 * It's enough to flush this one mapping.
	 * This appears conservative since it is only called
	 * from __set_fixmap.
	 */
	local_flush_tlb_page(NULL, vaddr, PAGE_SIZE);
}

void __set_fixmap(enum fixed_addresses idx, unsigned long phys, pgprot_t flags)
{
	unsigned long address = __fix_to_virt(idx);

	if (idx >= __end_of_fixed_addresses) {
		BUG();
		return;
	}
	set_pte_pfn(address, phys >> PAGE_SHIFT, flags);
}
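
/*
 * Illustrative sketch only: callers normally use the generic
 * set_fixmap()/clear_fixmap() wrappers rather than this function, e.g.
 *
 *	set_fixmap(idx, page_to_phys(page));	(map a fixed slot)
 *	clear_fixmap(idx);			(pte of zero unmaps it)
 *
 * where "idx" is some enum fixed_addresses value below
 * __end_of_fixed_addresses; storing <pfn,flags> as-is in set_pte_pfn()
 * is what makes the clearing case work.
 */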

#if defined(CONFIG_HIGHPTE)
pte_t *_pte_offset_map(pmd_t *dir, unsigned long address, enum km_type type)
{
	/* Add the table's offset within its page to the kmap'ed page VA. */
	pte_t *pte = kmap_atomic(pmd_page(*dir), type) +
		((pmd_ptfn(*dir) << HV_LOG2_PAGE_TABLE_ALIGN) & ~PAGE_MASK);
	return &pte[pte_index(address)];
}
#endif
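
/*
 * Worked example with made-up numbers: suppose 64KB pages and page
 * tables aligned to 16KB (HV_LOG2_PAGE_TABLE_ALIGN == 14).  A ptfn of
 * 0x9 places the table at byte address 0x9 << 14 == 0x24000; masking
 * with ~PAGE_MASK keeps 0x4000, the table's offset within its page,
 * which is then added to the kmap_atomic() mapping of that page.
 */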

/*
 * List of all pgd's needed so it can invalidate entries in both cached
 * and uncached pgd's. This is essentially codepath-based locking
 * against pageattr.c; it is the unique case in which a valid change
 * of kernel pagetables can't be lazily synchronized by vmalloc faults.
 * vmalloc faults work because attached pagetables are never freed.
 * The locking scheme was chosen on the basis of manfred's
 * recommendations and having no core impact whatsoever.
 * -- wli
 */
DEFINE_SPINLOCK(pgd_lock);
LIST_HEAD(pgd_list);

static inline void pgd_list_add(pgd_t *pgd)
{
	list_add(pgd_to_list(pgd), &pgd_list);
}

static inline void pgd_list_del(pgd_t *pgd)
{
	list_del(pgd_to_list(pgd));
}

#define KERNEL_PGD_INDEX_START pgd_index(PAGE_OFFSET)
#define KERNEL_PGD_PTRS (PTRS_PER_PGD - KERNEL_PGD_INDEX_START)
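
/*
 * Worked example with hypothetical 32-bit values: if PAGE_OFFSET were
 * 0xc0000000 with 4MB spanned per pgd entry and PTRS_PER_PGD == 1024,
 * then KERNEL_PGD_INDEX_START == 768 and KERNEL_PGD_PTRS == 256, so
 * pgd_ctor() below zeroes user slots 0..767 and clones kernel slots
 * 768..1023 from swapper_pg_dir.  The actual Tile values depend on the
 * configured page size.
 */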

static void pgd_ctor(pgd_t *pgd)
{
	unsigned long flags;

	memset(pgd, 0, KERNEL_PGD_INDEX_START*sizeof(pgd_t));
	spin_lock_irqsave(&pgd_lock, flags);

#ifndef __tilegx__
	/*
	 * Check that the user interrupt vector has no L2.
	 * It never should for the swapper, and new page tables
	 * should always start with an empty user interrupt vector.
	 */
	BUG_ON(((u64 *)swapper_pg_dir)[pgd_index(MEM_USER_INTRPT)] != 0);
#endif

	clone_pgd_range(pgd + KERNEL_PGD_INDEX_START,
			swapper_pg_dir + KERNEL_PGD_INDEX_START,
			KERNEL_PGD_PTRS);

	pgd_list_add(pgd);
	spin_unlock_irqrestore(&pgd_lock, flags);
}

static void pgd_dtor(pgd_t *pgd)
{
	unsigned long flags; /* can be called from interrupt context */

	spin_lock_irqsave(&pgd_lock, flags);
	pgd_list_del(pgd);
	spin_unlock_irqrestore(&pgd_lock, flags);
}

pgd_t *pgd_alloc(struct mm_struct *mm)
{
	pgd_t *pgd = kmem_cache_alloc(pgd_cache, GFP_KERNEL);
	if (pgd)
		pgd_ctor(pgd);
	return pgd;
}

void pgd_free(struct mm_struct *mm, pgd_t *pgd)
{
	pgd_dtor(pgd);
	kmem_cache_free(pgd_cache, pgd);
}


#define L2_USER_PGTABLE_PAGES (1 << L2_USER_PGTABLE_ORDER)
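
/*
 * Note: if L2_USER_PGTABLE_ORDER is nonzero, each "pte page" is really
 * a block of 2^order contiguous pages; e.g. with order 1,
 * pte_alloc_one() below allocates and zeroes two pages at a time, and
 * pte_free()/__pte_free_tlb() must hand all of them back.
 */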

struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address)
{
	gfp_t flags = GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO|__GFP_COMP;
	struct page *p;

#ifdef CONFIG_HIGHPTE
	flags |= __GFP_HIGHMEM;
#endif

	p = alloc_pages(flags, L2_USER_PGTABLE_ORDER);
	if (p == NULL)
		return NULL;

	pgtable_page_ctor(p);
	return p;
}

/*
 * Free page immediately (used in __pte_alloc if we raced with another
 * process).  We have to correct whatever pte_alloc_one() did before
 * returning the pages to the allocator.
 */
void pte_free(struct mm_struct *mm, struct page *p)
{
	pgtable_page_dtor(p);
	__free_pages(p, L2_USER_PGTABLE_ORDER);
}

void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte,
		    unsigned long address)
{
	int i;

	pgtable_page_dtor(pte);
	tlb->need_flush = 1;
	if (tlb_fast_mode(tlb)) {
		struct page *pte_pages[L2_USER_PGTABLE_PAGES];
		for (i = 0; i < L2_USER_PGTABLE_PAGES; ++i)
			pte_pages[i] = pte + i;
		free_pages_and_swap_cache(pte_pages, L2_USER_PGTABLE_PAGES);
		return;
	}
	for (i = 0; i < L2_USER_PGTABLE_PAGES; ++i) {
		tlb->pages[tlb->nr++] = pte + i;
		if (tlb->nr >= FREE_PTE_NR)
			tlb_flush_mmu(tlb, 0, 0);
	}
}

#ifndef __tilegx__

/*
 * FIXME: needs to be atomic vs hypervisor writes.  For now we make the
 * window of vulnerability a bit smaller by doing an unlocked 8-bit update.
 */
int ptep_test_and_clear_young(struct vm_area_struct *vma,
			      unsigned long addr, pte_t *ptep)
{
#if HV_PTE_INDEX_ACCESSED < 8 || HV_PTE_INDEX_ACCESSED >= 16
# error Code assumes HV_PTE "accessed" bit in second byte
#endif
	u8 *tmp = (u8 *)ptep;
	u8 second_byte = tmp[1];
	if (!(second_byte & (1 << (HV_PTE_INDEX_ACCESSED - 8))))
		return 0;
	tmp[1] = second_byte & ~(1 << (HV_PTE_INDEX_ACCESSED - 8));
	return 1;
}
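
/*
 * Worked example with a hypothetical layout: if HV_PTE_INDEX_ACCESSED
 * were 9, the accessed bit would live at bit (9 - 8) == 1 of byte 1 of
 * the PTE, so the code above reads tmp[1], tests 1 << 1, and clears it
 * with a single byte store rather than a full 64-bit read-modify-write.
 */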

/*
 * This implementation is atomic vs hypervisor writes, since the hypervisor
 * always writes the low word (where "accessed" and "dirty" are) and this
 * routine only writes the high word.
 */
void ptep_set_wrprotect(struct mm_struct *mm,
			unsigned long addr, pte_t *ptep)
{
#if HV_PTE_INDEX_WRITABLE < 32
# error Code assumes HV_PTE "writable" bit in high word
#endif
	u32 *tmp = (u32 *)ptep;
	tmp[1] = tmp[1] & ~(1 << (HV_PTE_INDEX_WRITABLE - 32));
}

#endif

pte_t *virt_to_pte(struct mm_struct *mm, unsigned long addr)
{
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;

	if (pgd_addr_invalid(addr))
		return NULL;

	pgd = mm ? pgd_offset(mm, addr) : swapper_pg_dir + pgd_index(addr);
	pud = pud_offset(pgd, addr);
	if (!pud_present(*pud))
		return NULL;
	pmd = pmd_offset(pud, addr);
	if (pmd_huge_page(*pmd))
		return (pte_t *)pmd;
	if (!pmd_present(*pmd))
		return NULL;
	return pte_offset_kernel(pmd, addr);
}
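
/*
 * Illustrative use: pass NULL to walk the kernel (swapper) tables, or
 * an mm to walk that process's tables, e.g.
 *
 *	pte_t *ptep = virt_to_pte(NULL, (unsigned long)addr);
 *
 * Note that for a huge page the returned pointer is actually the pmd
 * entry, so callers must be prepared for that case.
 */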

pgprot_t set_remote_cache_cpu(pgprot_t prot, int cpu)
{
	unsigned int width = smp_width;
	int x = cpu % width;
	int y = cpu / width;
	BUG_ON(y >= smp_height);
	BUG_ON(hv_pte_get_mode(prot) != HV_PTE_MODE_CACHE_TILE_L3);
	BUG_ON(cpu < 0 || cpu >= NR_CPUS);
	BUG_ON(!cpu_is_valid_lotar(cpu));
	return hv_pte_set_lotar(prot, HV_XY_TO_LOTAR(x, y));
}
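
/*
 * Worked example: on a hypothetical 8x8 mesh (smp_width == 8), cpu 10
 * maps to (x, y) == (10 % 8, 10 / 8) == (2, 1).  get_remote_cache_cpu()
 * below inverts this: 2 + 1 * 8 == 10.
 */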

int get_remote_cache_cpu(pgprot_t prot)
{
	HV_LOTAR lotar = hv_pte_get_lotar(prot);
	int x = HV_LOTAR_X(lotar);
	int y = HV_LOTAR_Y(lotar);
	BUG_ON(hv_pte_get_mode(prot) != HV_PTE_MODE_CACHE_TILE_L3);
	return x + y * smp_width;
}

void set_pte_order(pte_t *ptep, pte_t pte, int order)
{
	unsigned long pfn = pte_pfn(pte);
	struct page *page = pfn_to_page(pfn);

	/* Update the home of a PTE if necessary */
	pte = pte_set_home(pte, page_home(page));

#ifdef __tilegx__
	*ptep = pte;
#else
	/*
	 * When setting a PTE, write the high bits first, then write
	 * the low bits.  This sets the "present" bit only after the
	 * other bits are in place.  If a particular PTE update
	 * involves transitioning from one valid PTE to another, it
	 * may be necessary to call set_pte_order() more than once,
	 * transitioning via a suitable intermediate state.
	 * Note that this sequence also means that if we are transitioning
	 * from any migrating PTE to a non-migrating one, we will not
	 * see a half-updated PTE with the migrating bit off.
	 */
#if HV_PTE_INDEX_PRESENT >= 32 || HV_PTE_INDEX_MIGRATING >= 32
# error Must write the present and migrating bits last
#endif
	((u32 *)ptep)[1] = (u32)(pte_val(pte) >> 32);
	barrier();
	((u32 *)ptep)[0] = (u32)(pte_val(pte));
#endif
}
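
/*
 * Illustrative sketch only: per the comment above, a valid-to-valid
 * update might go through a non-present intermediate state so no cpu
 * can observe a mix of old and new bits, along the lines of
 *
 *	set_pte_order(ptep, hv_pte_clear_present(old), 0);
 *	flush_remote(...);		(flush any stale TLB copies)
 *	set_pte_order(ptep, new, 0);
 *
 * The helper names in this sketch are assumptions, not APIs defined
 * in this file.
 */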

/* Can this mm load a PTE with cached_priority set? */
static inline int mm_is_priority_cached(struct mm_struct *mm)
{
	return mm->context.priority_cached;
}

/*
 * Add a priority mapping to an mm_context and
 * notify the hypervisor if this is the first one.
 */
void start_mm_caching(struct mm_struct *mm)
{
	if (!mm_is_priority_cached(mm)) {
		mm->context.priority_cached = -1U;
		hv_set_caching(-1U);
	}
}

/*
 * Validate and return the priority_cached flag.  We know if it's zero
 * that we don't need to scan, since we immediately set it non-zero
 * when we first consider a MAP_CACHE_PRIORITY mapping.
 *
 * We only _try_ to acquire mmap_sem; if we can't, then since we're in
 * an interrupt context (servicing switch_mm) we don't worry about it
 * and don't unset the "priority_cached" field.  Presumably we'll come
 * back later with more luck and clear the value then; for now we just
 * keep the cache marked for priority.
 */
static unsigned int update_priority_cached(struct mm_struct *mm)
{
	if (mm->context.priority_cached && down_write_trylock(&mm->mmap_sem)) {
		struct vm_area_struct *vm;
		for (vm = mm->mmap; vm; vm = vm->vm_next) {
			if (hv_pte_get_cached_priority(vm->vm_page_prot))
				break;
		}
		if (vm == NULL)
			mm->context.priority_cached = 0;
		up_write(&mm->mmap_sem);
	}
	return mm->context.priority_cached;
}

/* Set caching correctly for an mm that we are switching to. */
void check_mm_caching(struct mm_struct *prev, struct mm_struct *next)
{
	if (!mm_is_priority_cached(next)) {
		/*
		 * If the new mm doesn't use priority caching, just see if we
		 * need the hv_set_caching(), or can assume it's already zero.
		 */
		if (mm_is_priority_cached(prev))
			hv_set_caching(0);
	} else {
		hv_set_caching(update_priority_cached(next));
	}
}

#if CHIP_HAS_MMIO()

/* Map an arbitrary MMIO address, homed according to pgprot, into VA space. */
void __iomem *ioremap_prot(resource_size_t phys_addr, unsigned long size,
			   pgprot_t home)
{
	void *addr;
	struct vm_struct *area;
	unsigned long offset, last_addr;
	pgprot_t pgprot;

	/* Don't allow wraparound or zero size */
	last_addr = phys_addr + size - 1;
	if (!size || last_addr < phys_addr)
		return NULL;

	/* Create a read/write, MMIO VA mapping homed at the requested shim. */
	pgprot = PAGE_KERNEL;
	pgprot = hv_pte_set_mode(pgprot, HV_PTE_MODE_MMIO);
	pgprot = hv_pte_set_lotar(pgprot, hv_pte_get_lotar(home));

	/* Mappings have to be page-aligned. */
	offset = phys_addr & ~PAGE_MASK;
	phys_addr &= PAGE_MASK;
	size = PAGE_ALIGN(last_addr+1) - phys_addr;

	/* OK, go for it. */
	area = get_vm_area(size, VM_IOREMAP /* | other flags? */);
	if (!area)
		return NULL;
	area->phys_addr = phys_addr;
	addr = area->addr;
	if (ioremap_page_range((unsigned long)addr, (unsigned long)addr + size,
			       phys_addr, pgprot)) {
		remove_vm_area((void *)(PAGE_MASK & (unsigned long)addr));
		return NULL;
	}
	return (__force void __iomem *) (offset + (char *)addr);
}
EXPORT_SYMBOL(ioremap_prot);
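
/*
 * Illustrative use (device and addresses are made up): to map a 4KB
 * device register window homed at a particular shim, a driver might do
 *
 *	pgprot_t home = hv_pte_set_lotar(PAGE_KERNEL, shim_lotar);
 *	void __iomem *regs = ioremap_prot(0x10000000UL, 0x1000, home);
 *	if (regs)
 *		writel(1, regs + 0x40);
 *
 * Only the lotar bits of "home" are consulted above.
 */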

/* Map a PCI MMIO bus address into VA space. */
void __iomem *ioremap(resource_size_t phys_addr, unsigned long size)
{
	panic("ioremap for PCI MMIO is not supported");
}
EXPORT_SYMBOL(ioremap);

/* Unmap an MMIO VA mapping. */
void iounmap(volatile void __iomem *addr_in)
{
	volatile void __iomem *addr = (volatile void __iomem *)
		(PAGE_MASK & (unsigned long __force)addr_in);
#if 1
	vunmap((void * __force)addr);
#else
	/* x86 uses this complicated flow instead of vunmap().  Is
	 * there any particular reason we should do the same? */
	struct vm_struct *p, *o;

	/*
	 * Use the vm area unlocked, assuming the caller ensures there
	 * isn't another iounmap for the same address in parallel.
	 * Reuse of the virtual address is prevented by leaving it in
	 * the global lists until we're done with it.  cpa takes care
	 * of the direct mappings.
	 */
	read_lock(&vmlist_lock);
	for (p = vmlist; p; p = p->next) {
		if (p->addr == addr)
			break;
	}
	read_unlock(&vmlist_lock);

	if (!p) {
		pr_err("iounmap: bad address %p\n", addr);
		dump_stack();
		return;
	}

	/* Finally remove it. */
	o = remove_vm_area((void *)addr);
	BUG_ON(p != o || o == NULL);
	kfree(p);
#endif
}
EXPORT_SYMBOL(iounmap);

#endif /* CHIP_HAS_MMIO() */