/*
 * Initialize MMU support.
 *
 * Copyright (C) 1998-2003 Hewlett-Packard Co
 *	David Mosberger-Tang <davidm@hpl.hp.com>
 */
#include <linux/config.h>
#include <linux/kernel.h>
#include <linux/init.h>

#include <linux/bootmem.h>
#include <linux/efi.h>
#include <linux/elf.h>
#include <linux/mm.h>
#include <linux/mmzone.h>
#include <linux/module.h>
#include <linux/personality.h>
#include <linux/reboot.h>
#include <linux/slab.h>
#include <linux/swap.h>
#include <linux/proc_fs.h>
#include <linux/bitops.h>

#include <asm/a.out.h>
#include <asm/dma.h>
#include <asm/ia32.h>
#include <asm/io.h>
#include <asm/machvec.h>
#include <asm/numa.h>
#include <asm/patch.h>
#include <asm/pgalloc.h>
#include <asm/sal.h>
#include <asm/sections.h>
#include <asm/system.h>
#include <asm/tlb.h>
#include <asm/uaccess.h>
#include <asm/unistd.h>
#include <asm/mca.h>

DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);

extern void ia64_tlb_init (void);

unsigned long MAX_DMA_ADDRESS = PAGE_OFFSET + 0x100000000UL;

#ifdef CONFIG_VIRTUAL_MEM_MAP
unsigned long vmalloc_end = VMALLOC_END_INIT;
EXPORT_SYMBOL(vmalloc_end);
struct page *vmem_map;
EXPORT_SYMBOL(vmem_map);
#endif

static int pgt_cache_water[2] = { 25, 50 };

struct page *zero_page_memmap_ptr;      /* map entry for zero page */
EXPORT_SYMBOL(zero_page_memmap_ptr);

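/*
 * Trim the page-table quicklists: once the cached total exceeds the
 * high water mark, free cached pgd/pmd pages until it drops back to
 * the low water mark.
 */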
void
check_pgt_cache (void)
{
        int low, high;

        low = pgt_cache_water[0];
        high = pgt_cache_water[1];

        preempt_disable();
        if (pgtable_cache_size > (u64) high) {
                do {
                        if (pgd_quicklist)
                                free_page((unsigned long)pgd_alloc_one_fast(NULL));
                        if (pmd_quicklist)
                                free_page((unsigned long)pmd_alloc_one_fast(NULL, 0));
                } while (pgtable_cache_size > (u64) low);
        }
        preempt_enable();
}

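/*
 * Keep the i-cache coherent with the d-cache for executable pages:
 * PG_arch_1 records that the page's i-cache lines have already been
 * flushed, so the flush is skipped while the page is known clean.
 */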
void
lazy_mmu_prot_update (pte_t pte)
{
        unsigned long addr;
        struct page *page;

        if (!pte_exec(pte))
                return;                         /* not an executable page... */

        page = pte_page(pte);
        addr = (unsigned long) page_address(page);

        if (test_bit(PG_arch_1, &page->flags))
                return;                         /* i-cache is already coherent with d-cache */

        flush_icache_range(addr, addr + PAGE_SIZE);
        set_bit(PG_arch_1, &page->flags);       /* mark page as clean */
}

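/*
 * The register backing store grows upward from thread.rbs_bot while
 * the memory stack grows downward from STACK_TOP; placing rbs_bot at
 * STACK_TOP minus the stack's maximum size (capped at
 * MAX_USER_STACK_SIZE) lets the two share the RLIMIT_STACK region.
 */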
inline void
ia64_set_rbs_bot (void)
{
        unsigned long stack_size = current->signal->rlim[RLIMIT_STACK].rlim_max & -16;

        if (stack_size > MAX_USER_STACK_SIZE)
                stack_size = MAX_USER_STACK_SIZE;
        current->thread.rbs_bot = STACK_TOP - stack_size;
}

/*
 * This performs some platform-dependent address space initialization.
 * On IA-64, we want to set up the VM area for the register backing
 * store (which grows upwards) and install the gateway page which is
 * used for signal trampolines, etc.
 */
void
ia64_init_addr_space (void)
{
        struct vm_area_struct *vma;

        ia64_set_rbs_bot();

        /*
         * If we're out of memory and kmem_cache_alloc() returns NULL, we simply ignore
         * the problem.  When the process attempts to write to the register backing store
         * for the first time, it will get a SEGFAULT in this case.
         */
        vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
        if (vma) {
                memset(vma, 0, sizeof(*vma));
                vma->vm_mm = current->mm;
                vma->vm_start = current->thread.rbs_bot & PAGE_MASK;
                vma->vm_end = vma->vm_start + PAGE_SIZE;
                vma->vm_page_prot = protection_map[VM_DATA_DEFAULT_FLAGS & 0x7];
                vma->vm_flags = VM_DATA_DEFAULT_FLAGS | VM_GROWSUP;
                down_write(&current->mm->mmap_sem);
                if (insert_vm_struct(current->mm, vma)) {
                        up_write(&current->mm->mmap_sem);
                        kmem_cache_free(vm_area_cachep, vma);
                        return;
                }
                up_write(&current->mm->mmap_sem);
        }

        /* map NaT-page at address zero to speed up speculative dereferencing of NULL: */
        if (!(current->personality & MMAP_PAGE_ZERO)) {
                vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
                if (vma) {
                        memset(vma, 0, sizeof(*vma));
                        vma->vm_mm = current->mm;
                        vma->vm_end = PAGE_SIZE;
                        vma->vm_page_prot = __pgprot(pgprot_val(PAGE_READONLY) | _PAGE_MA_NAT);
                        vma->vm_flags = VM_READ | VM_MAYREAD | VM_IO | VM_RESERVED;
                        down_write(&current->mm->mmap_sem);
                        if (insert_vm_struct(current->mm, vma)) {
                                up_write(&current->mm->mmap_sem);
                                kmem_cache_free(vm_area_cachep, vma);
                                return;
                        }
                        up_write(&current->mm->mmap_sem);
                }
        }
}

void
free_initmem (void)
{
        unsigned long addr, eaddr;

        addr = (unsigned long) ia64_imva(__init_begin);
        eaddr = (unsigned long) ia64_imva(__init_end);
        while (addr < eaddr) {
                ClearPageReserved(virt_to_page(addr));
                set_page_count(virt_to_page(addr), 1);
                free_page(addr);
                ++totalram_pages;
                addr += PAGE_SIZE;
        }
        printk(KERN_INFO "Freeing unused kernel memory: %ldkB freed\n",
               (__init_end - __init_begin) >> 10);
}

void
free_initrd_mem (unsigned long start, unsigned long end)
{
        struct page *page;
        /*
         * EFI uses 4KB pages while the kernel can use 4KB or bigger.
         * Thus EFI and the kernel may have different page sizes. It is
         * therefore possible to have the initrd share the same page as
         * the end of the kernel (given current setup).
         *
         * To avoid freeing/using the wrong page (kernel sized) we:
         *      - align up the beginning of initrd
         *      - align down the end of initrd
         *
         *  |             |
         *  |=============| a000
         *  |             |
         *  |             |
         *  |             | 9000
         *  |/////////////|
         *  |/////////////|
         *  |=============| 8000
         *  |///INITRD////|
         *  |/////////////|
         *  |/////////////| 7000
         *  |             |
         *  |KKKKKKKKKKKKK|
         *  |=============| 6000
         *  |KKKKKKKKKKKKK|
         *  |KKKKKKKKKKKKK|
         *  K=kernel using 8KB pages
         *
         * In this example, we must free page 8000 ONLY. So we must align up
         * initrd_start and keep initrd_end as is.
         */
        start = PAGE_ALIGN(start);
        end = end & PAGE_MASK;

        if (start < end)
                printk(KERN_INFO "Freeing initrd memory: %ldkB freed\n", (end - start) >> 10);

        for (; start < end; start += PAGE_SIZE) {
                if (!virt_addr_valid(start))
                        continue;
                page = virt_to_page(start);
                ClearPageReserved(page);
                set_page_count(page, 1);
                free_page(start);
                ++totalram_pages;
        }
}

/*
 * This installs a clean page in the kernel's page table.
 */
struct page *
put_kernel_page (struct page *page, unsigned long address, pgprot_t pgprot)
{
        pgd_t *pgd;
        pud_t *pud;
        pmd_t *pmd;
        pte_t *pte;

        if (!PageReserved(page))
                printk(KERN_ERR "put_kernel_page: page at 0x%p not in reserved memory\n",
                       page_address(page));

        pgd = pgd_offset_k(address);            /* note: this is NOT pgd_offset()! */

        spin_lock(&init_mm.page_table_lock);
        {
                pud = pud_alloc(&init_mm, pgd, address);
                if (!pud)
                        goto out;

                pmd = pmd_alloc(&init_mm, pud, address);
                if (!pmd)
                        goto out;
                pte = pte_alloc_map(&init_mm, pmd, address);
                if (!pte)
                        goto out;
                if (!pte_none(*pte)) {
                        pte_unmap(pte);
                        goto out;
                }
                set_pte(pte, mk_pte(page, pgprot));
                pte_unmap(pte);
        }
  out:  spin_unlock(&init_mm.page_table_lock);
        /* no need for flush_tlb */
        return page;
}

static void
setup_gate (void)
{
        struct page *page;

        /*
         * Map the gate page twice: once read-only to export the ELF headers etc. and once
         * execute-only page to enable privilege-promotion via "epc":
         */
        page = virt_to_page(ia64_imva(__start_gate_section));
        put_kernel_page(page, GATE_ADDR, PAGE_READONLY);
#ifdef HAVE_BUGGY_SEGREL
        page = virt_to_page(ia64_imva(__start_gate_section + PAGE_SIZE));
        put_kernel_page(page, GATE_ADDR + PAGE_SIZE, PAGE_GATE);
#else
        put_kernel_page(page, GATE_ADDR + PERCPU_PAGE_SIZE, PAGE_GATE);
#endif
        ia64_patch_gate();
}

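/*
 * Per-CPU MMU setup: pin this CPU's per-CPU data area into the TLB,
 * program the PTA register that drives the VHPT walker and the
 * virtually mapped linear page table, and initialize the TLB.
 */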
void __devinit
ia64_mmu_init (void *my_cpu_data)
{
        unsigned long psr, pta, impl_va_bits;
        extern void __devinit tlb_init (void);

#ifdef CONFIG_DISABLE_VHPT
#       define VHPT_ENABLE_BIT  0
#else
#       define VHPT_ENABLE_BIT  1
#endif

        /* Pin mapping for percpu area into TLB */
        psr = ia64_clear_ic();
        ia64_itr(0x2, IA64_TR_PERCPU_DATA, PERCPU_ADDR,
                 pte_val(pfn_pte(__pa(my_cpu_data) >> PAGE_SHIFT, PAGE_KERNEL)),
                 PERCPU_PAGE_SHIFT);

        ia64_set_psr(psr);
        ia64_srlz_i();

        /*
         * Check if the virtually mapped linear page table (VMLPT) overlaps with a mapped
         * address space.  The IA-64 architecture guarantees that at least 50 bits of
         * virtual address space are implemented but if we pick a large enough page size
         * (e.g., 64KB), the mapped address space is big enough that it will overlap with
         * VMLPT.  I assume that once we run on machines big enough to warrant 64KB pages,
         * IMPL_VA_MSB will be significantly bigger, so this is unlikely to become a
         * problem in practice.  Alternatively, we could truncate the top of the mapped
         * address space to not permit mappings that would overlap with the VMLPT.
         * --davidm 00/12/06
         */
#       define pte_bits                 3
#       define mapped_space_bits        (3*(PAGE_SHIFT - pte_bits) + PAGE_SHIFT)
        /*
         * The virtual page table has to cover the entire implemented address space within
         * a region even though not all of this space may be mappable.  The reason for
         * this is that the Access bit and Dirty bit fault handlers perform
         * non-speculative accesses to the virtual page table, so the address range of the
         * virtual page table itself needs to be covered by the virtual page table.
         */
#       define vmlpt_bits               (impl_va_bits - PAGE_SHIFT + pte_bits)
#       define POW2(n)                  (1ULL << (n))

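        /*
         * Worked example, assuming 16KB pages (PAGE_SHIFT = 14) and the
         * architectural minimum of impl_va_bits = 51:
         *
         *      mapped_space_bits = 3*(14 - 3) + 14 = 47
         *      vmlpt_bits        = 51 - 14 + 3     = 40
         *      pta               = 2^61 - 2^40
         *
         * so the VMLPT occupies the top 2^40 bytes of each region while
         * the 2^47 bytes of page-table mapped space stay well below it.
         */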
        impl_va_bits = ffz(~(local_cpu_data->unimpl_va_mask | (7UL << 61)));

        if (impl_va_bits < 51 || impl_va_bits > 61)
                panic("CPU has bogus IMPL_VA_MSB value of %lu!\n", impl_va_bits - 1);

        /* place the VMLPT at the end of each page-table mapped region: */
        pta = POW2(61) - POW2(vmlpt_bits);

        if (POW2(mapped_space_bits) >= pta)
                panic("mm/init: overlap between virtually mapped linear page table and "
                      "mapped kernel space!");
        /*
         * Set the (virtually mapped linear) page table address.  Bit
         * 8 selects between the short and long format, bits 2-7 the
         * size of the table, and bit 0 whether the VHPT walker is
         * enabled.
         */
        ia64_set_pta(pta | (0 << 8) | (vmlpt_bits << 2) | VHPT_ENABLE_BIT);

        ia64_tlb_init();

#ifdef CONFIG_HUGETLB_PAGE
        ia64_set_rr(HPAGE_REGION_BASE, HPAGE_SHIFT << 2);
        ia64_srlz_d();
#endif
}

#ifdef CONFIG_VIRTUAL_MEM_MAP

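/*
 * Populate the kernel page tables that back the portion of the
 * virtual mem_map covering the physical range [start, end).  Page
 * table pages, and the mem_map pages themselves, are allocated from
 * the node that owns the described memory.
 */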
int
create_mem_map_page_table (u64 start, u64 end, void *arg)
{
        unsigned long address, start_page, end_page;
        struct page *map_start, *map_end;
        int node;
        pgd_t *pgd;
        pud_t *pud;
        pmd_t *pmd;
        pte_t *pte;

        map_start = vmem_map + (__pa(start) >> PAGE_SHIFT);
        map_end = vmem_map + (__pa(end) >> PAGE_SHIFT);

        start_page = (unsigned long) map_start & PAGE_MASK;
        end_page = PAGE_ALIGN((unsigned long) map_end);
        node = paddr_to_nid(__pa(start));

        for (address = start_page; address < end_page; address += PAGE_SIZE) {
                pgd = pgd_offset_k(address);
                if (pgd_none(*pgd))
                        pgd_populate(&init_mm, pgd, alloc_bootmem_pages_node(NODE_DATA(node), PAGE_SIZE));
                pud = pud_offset(pgd, address);

                if (pud_none(*pud))
                        pud_populate(&init_mm, pud, alloc_bootmem_pages_node(NODE_DATA(node), PAGE_SIZE));
                pmd = pmd_offset(pud, address);

                if (pmd_none(*pmd))
                        pmd_populate_kernel(&init_mm, pmd, alloc_bootmem_pages_node(NODE_DATA(node), PAGE_SIZE));
                pte = pte_offset_kernel(pmd, address);

                if (pte_none(*pte))
                        set_pte(pte, pfn_pte(__pa(alloc_bootmem_pages_node(NODE_DATA(node), PAGE_SIZE)) >> PAGE_SHIFT,
                                             PAGE_KERNEL));
        }
        return 0;
}

struct memmap_init_callback_data {
        struct page *start;
        struct page *end;
        int nid;
        unsigned long zone;
};

static int
virtual_memmap_init (u64 start, u64 end, void *arg)
{
        struct memmap_init_callback_data *args;
        struct page *map_start, *map_end;

        args = (struct memmap_init_callback_data *) arg;
        map_start = vmem_map + (__pa(start) >> PAGE_SHIFT);
        map_end = vmem_map + (__pa(end) >> PAGE_SHIFT);

        if (map_start < args->start)
                map_start = args->start;
        if (map_end > args->end)
                map_end = args->end;

        /*
         * We have to initialize "out of bounds" struct page elements that fit completely
         * on the same pages that were allocated for the "in bounds" elements because they
         * may be referenced later (and found to be "reserved").
         */
        map_start -= ((unsigned long) map_start & (PAGE_SIZE - 1)) / sizeof(struct page);
        map_end += ((PAGE_ALIGN((unsigned long) map_end) - (unsigned long) map_end)
                    / sizeof(struct page));

        if (map_start < map_end)
                memmap_init_zone((unsigned long)(map_end - map_start),
                                 args->nid, args->zone, page_to_pfn(map_start));
        return 0;
}

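/*
 * Arch-specific memmap_init(): without a virtual mem_map this is just
 * memmap_init_zone(); with one, only the struct pages backed by
 * actual physical memory are initialized, as found by walking the
 * EFI memory map.
 */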
void
memmap_init (unsigned long size, int nid, unsigned long zone,
             unsigned long start_pfn)
{
        if (!vmem_map)
                memmap_init_zone(size, nid, zone, start_pfn);
        else {
                struct page *start;
                struct memmap_init_callback_data args;

                start = pfn_to_page(start_pfn);
                args.start = start;
                args.end = start + size;
                args.nid = nid;
                args.zone = zone;

                efi_memmap_walk(virtual_memmap_init, &args);
        }
}

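/*
 * With a virtually mapped (and therefore possibly sparse) mem_map, a
 * pfn is valid only if its struct page is actually mapped.  Probe the
 * entry with __get_user(), which fails cleanly on an unmapped
 * address; if the entry straddles a page boundary, probe its last
 * byte as well.
 */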
int
ia64_pfn_valid (unsigned long pfn)
{
        char byte;
        struct page *pg = pfn_to_page(pfn);

        return     (__get_user(byte, (char __user *) pg) == 0)
                && ((((u64)pg & PAGE_MASK) == (((u64)(pg + 1) - 1) & PAGE_MASK))
                        || (__get_user(byte, (char __user *) (pg + 1) - 1) == 0));
}
EXPORT_SYMBOL(ia64_pfn_valid);

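/*
 * Memory-map walk callback: records the largest gap seen between
 * consecutive memory descriptors in *arg.  last_end is static, so
 * this supports only a single, ordered walk (see NOTE below).
 */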
int
find_largest_hole (u64 start, u64 end, void *arg)
{
        u64 *max_gap = arg;

        static u64 last_end = PAGE_OFFSET;

        /* NOTE: this algorithm assumes efi memmap table is ordered */

        if (*max_gap < (start - last_end))
                *max_gap = start - last_end;
        last_end = end;
        return 0;
}
#endif /* CONFIG_VIRTUAL_MEM_MAP */

static int
count_reserved_pages (u64 start, u64 end, void *arg)
{
        unsigned long num_reserved = 0;
        unsigned long *count = arg;

        for (; start < end; start += PAGE_SIZE)
                if (PageReserved(virt_to_page(start)))
                        ++num_reserved;
        *count += num_reserved;
        return 0;
}

/*
 * Boot command-line option "nolwsys" can be used to disable the use of any light-weight
 * system call handler.  When this option is in effect, all fsyscalls will end up bubbling
 * down into the kernel and calling the normal (heavy-weight) syscall handler.  This is
 * useful for performance testing, but conceivably could also come in handy for debugging
 * purposes.
 */

static int nolwsys;

static int __init
nolwsys_setup (char *s)
{
        nolwsys = 1;
        return 1;
}

__setup("nolwsys", nolwsys_setup);

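/*
 * Final memory initialization: hand bootmem over to the page
 * allocator, report the memory layout, size the page-table cache,
 * mark fsyscall entries that lack a light-weight handler, and
 * install the gate page.
 */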
void
mem_init (void)
{
        long reserved_pages, codesize, datasize, initsize;
        unsigned long num_pgt_pages;
        pg_data_t *pgdat;
        int i;
        static struct kcore_list kcore_mem, kcore_vmem, kcore_kernel;

#ifdef CONFIG_PCI
        /*
         * This needs to be called _after_ the command line has been parsed but _before_
         * any drivers that may need the PCI DMA interface are initialized or bootmem has
         * been freed.
         */
        platform_dma_init();
#endif

#ifndef CONFIG_DISCONTIGMEM
        if (!mem_map)
                BUG();
        max_mapnr = max_low_pfn;
#endif

        high_memory = __va(max_low_pfn * PAGE_SIZE);

        kclist_add(&kcore_mem, __va(0), max_low_pfn * PAGE_SIZE);
        kclist_add(&kcore_vmem, (void *)VMALLOC_START, VMALLOC_END-VMALLOC_START);
        kclist_add(&kcore_kernel, _stext, _end - _stext);

        for_each_pgdat(pgdat)
                totalram_pages += free_all_bootmem_node(pgdat);

        reserved_pages = 0;
        efi_memmap_walk(count_reserved_pages, &reserved_pages);

        codesize = (unsigned long) _etext - (unsigned long) _stext;
        datasize = (unsigned long) _edata - (unsigned long) _etext;
        initsize = (unsigned long) __init_end - (unsigned long) __init_begin;

        printk(KERN_INFO "Memory: %luk/%luk available (%luk code, %luk reserved, "
               "%luk data, %luk init)\n", (unsigned long) nr_free_pages() << (PAGE_SHIFT - 10),
               num_physpages << (PAGE_SHIFT - 10), codesize >> 10,
               reserved_pages << (PAGE_SHIFT - 10), datasize >> 10, initsize >> 10);

        /*
         * Allow for enough (cached) page table pages so that we can map the entire memory
         * at least once.  Each task also needs a couple of page table pages, so add in a
         * fudge factor for that (don't use "threads-max" here; that would be wrong!).
         * Don't allow the cache to be more than 10% of total memory, though.
         */
#       define NUM_TASKS        500     /* typical number of tasks */
        num_pgt_pages = nr_free_pages() / PTRS_PER_PGD + NUM_TASKS;
        if (num_pgt_pages > nr_free_pages() / 10)
                num_pgt_pages = nr_free_pages() / 10;
        if (num_pgt_pages > (u64) pgt_cache_water[1])
                pgt_cache_water[1] = num_pgt_pages;

        /*
         * For fsyscall entry points with no light-weight handler, use the ordinary
         * (heavy-weight) handler, but mark it by setting bit 0, so the fsyscall entry
         * code can tell them apart.
         */
        for (i = 0; i < NR_syscalls; ++i) {
                extern unsigned long fsyscall_table[NR_syscalls];
                extern unsigned long sys_call_table[NR_syscalls];

                if (!fsyscall_table[i] || nolwsys)
                        fsyscall_table[i] = sys_call_table[i] | 1;
        }
        setup_gate();

#ifdef CONFIG_IA32_SUPPORT
        ia32_mem_init();
#endif
}