[PATCH] x86_64: Try to allocate node memmap near the end of node
[deliverable/linux.git] / arch / x86_64 / mm / numa.c
CommitLineData
1da177e4
LT
1/*
2 * Generic VM initialization for x86-64 NUMA setups.
3 * Copyright 2002,2003 Andi Kleen, SuSE Labs.
4 */
5#include <linux/kernel.h>
6#include <linux/mm.h>
7#include <linux/string.h>
8#include <linux/init.h>
9#include <linux/bootmem.h>
10#include <linux/mmzone.h>
11#include <linux/ctype.h>
12#include <linux/module.h>
13#include <linux/nodemask.h>
14
15#include <asm/e820.h>
16#include <asm/proto.h>
17#include <asm/dma.h>
18#include <asm/numa.h>
19#include <asm/acpi.h>
20
/* Debug printk: compiles to nothing unless Dprintk is defined elsewhere. */
#ifndef Dprintk
#define Dprintk(x...)
#endif
24
/* Per-node pg_data_t pointers, set up by setup_node_bootmem(). */
struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
/* Boot-time allocator state, one per node (hooked into NODE_DATA->bdata). */
bootmem_data_t plat_node_bdata[MAX_NUMNODES];

/* Physical address -> node hash: nid = memnodemap[addr >> memnode_shift].
   0xff in the map means "no node". */
int memnode_shift;
u8 memnodemap[NODEMAPSIZE];

/* cpu -> node; NUMA_NO_NODE until assigned by numa_set_node(). */
unsigned char cpu_to_node[NR_CPUS] __read_mostly = {
	[0 ... NR_CPUS-1] = NUMA_NO_NODE
};
/* local APIC id -> node; consulted by init_cpu_to_node().
   NOTE(review): presumably filled by the ACPI/K8 NUMA scan — not visible here. */
unsigned char apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = {
	[0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
};
/* node -> mask of CPUs on it; bits set by numa_add_cpu(). */
cpumask_t node_to_cpumask[MAX_NUMNODES] __read_mostly;

/* Set by the "numa=off" command line option. */
int numa_off __initdata;
40
529a3404
ED
41
/*
 * Given a shift value, try to populate memnodemap[]
 * Returns :
 * 1 if OK
 * 0 if memnodemap[] too small (or shift too small)
 * -1 if node overlap or lost ram (shift too big)
 */
static int __init
populate_memnodemap(const struct bootnode *nodes, int numnodes, int shift)
{
	int i;
	int res = -1;
	unsigned long addr, end;

	/* Shifting an unsigned long by >= 64 would be undefined. */
	if (shift >= 64)
		return -1;
	/* 0xff marks "no node" in the map. */
	memset(memnodemap, 0xff, sizeof(memnodemap));
	for (i = 0; i < numnodes; i++) {
		addr = nodes[i].start;
		end = nodes[i].end;
		if (addr >= end)
			continue;	/* empty node, nothing to map */
		/* Map index out of range: shift too small for this span. */
		if ((end >> shift) >= NODEMAPSIZE)
			return 0;
		do {
			/* Slot already owned by another node: with this
			   shift two nodes collide, so it is too big. */
			if (memnodemap[addr >> shift] != 0xff)
				return -1;
			memnodemap[addr >> shift] = i;
			addr += (1UL << shift);
		} while (addr < end);
		res = 1;	/* mapped at least one non-empty node */
	}
	return res;
}
76
/*
 * Find the largest hash shift that still maps every node without
 * collisions, and install it into memnodemap[].  Returns the shift,
 * or -1 if no conflict-free mapping fits in NODEMAPSIZE.
 */
int __init compute_hash_shift(struct bootnode *nodes, int numnodes)
{
	int shift = 20;

	/* Grow the shift while populate_memnodemap() reports either
	   success (1) or "map too small" (0); stop on overlap (-1). */
	while (populate_memnodemap(nodes, numnodes, shift + 1) >= 0)
		shift++;

	printk(KERN_DEBUG "NUMA: Using %d for the hash shift.\n",
		shift);

	/* The chosen shift must populate the map outright. */
	if (populate_memnodemap(nodes, numnodes, shift) != 1) {
		printk(KERN_INFO
	"Your memory is not aligned you need to rebuild your kernel "
	"with a bigger NODEMAPSIZE shift=%d\n",
			shift);
		return -1;
	}
	return shift;
}
96
bbfceef4
MT
97#ifdef CONFIG_SPARSEMEM
/* Early pfn -> node lookup via the physical-address hash (memnodemap). */
int early_pfn_to_nid(unsigned long pfn)
{
	return phys_to_nid(pfn << PAGE_SHIFT);
}
102#endif
103
1da177e4
LT
/* Initialize bootmem allocator for a node */
void __init setup_node_bootmem(int nodeid, unsigned long start, unsigned long end)
{
	unsigned long start_pfn, end_pfn, bootmap_pages, bootmap_size, bootmap_start;
	unsigned long nodedata_phys;
	const int pgdat_size = round_up(sizeof(pg_data_t), PAGE_SIZE);

	start = round_up(start, ZONE_ALIGN);

	printk(KERN_INFO "Bootmem setup node %d %016lx-%016lx\n", nodeid, start, end);

	start_pfn = start >> PAGE_SHIFT;
	end_pfn = end >> PAGE_SHIFT;

	/* Carve the node's pg_data_t out of the node's own memory. */
	nodedata_phys = find_e820_area(start, end, pgdat_size);
	if (nodedata_phys == -1L)
		panic("Cannot find memory pgdat in node %d\n", nodeid);

	Dprintk("nodedata_phys %lx\n", nodedata_phys);

	node_data[nodeid] = phys_to_virt(nodedata_phys);
	memset(NODE_DATA(nodeid), 0, sizeof(pg_data_t));
	NODE_DATA(nodeid)->bdata = &plat_node_bdata[nodeid];
	NODE_DATA(nodeid)->node_start_pfn = start_pfn;
	NODE_DATA(nodeid)->node_spanned_pages = end_pfn - start_pfn;

	/* Find a place for the bootmem map */
	bootmap_pages = bootmem_bootmap_pages(end_pfn - start_pfn);
	/* Search after the pgdat we just placed, to avoid overlapping it. */
	bootmap_start = round_up(nodedata_phys + pgdat_size, PAGE_SIZE);
	bootmap_start = find_e820_area(bootmap_start, end, bootmap_pages<<PAGE_SHIFT);
	if (bootmap_start == -1L)
		panic("Not enough continuous space for bootmap on node %d", nodeid);
	Dprintk("bootmap start %lu pages %lu\n", bootmap_start, bootmap_pages);

	bootmap_size = init_bootmem_node(NODE_DATA(nodeid),
					 bootmap_start >> PAGE_SHIFT,
					 start_pfn, end_pfn);

	/* Hand the node's usable RAM (per e820) to bootmem ... */
	e820_bootmem_free(NODE_DATA(nodeid), start, end);

	/* ... then reserve the pgdat and the bootmap we allocated above. */
	reserve_bootmem_node(NODE_DATA(nodeid), nodedata_phys, pgdat_size);
	reserve_bootmem_node(NODE_DATA(nodeid), bootmap_start, bootmap_pages<<PAGE_SHIFT);
	node_set_online(nodeid);
}
148
/* Initialize final allocator for a zone */
void __init setup_node_zones(int nodeid)
{
	unsigned long start_pfn, end_pfn, memmapsize, limit;
	unsigned long zones[MAX_NR_ZONES];
	unsigned long holes[MAX_NR_ZONES];

	start_pfn = node_start_pfn(nodeid);
	end_pfn = node_end_pfn(nodeid);

	Dprintk(KERN_INFO "Setting up node %d %lx-%lx\n",
		nodeid, start_pfn, end_pfn);

	/* Try to allocate mem_map at end to not fill up precious <4GB
	   memory.  The goal range [limit - memmapsize, limit) sits at the
	   very top of the node; __alloc_bootmem_core falls back to lower
	   addresses if that range is unavailable. */
	memmapsize = sizeof(struct page) * (end_pfn-start_pfn);
	limit = end_pfn << PAGE_SHIFT;
	NODE_DATA(nodeid)->node_mem_map =
		__alloc_bootmem_core(NODE_DATA(nodeid)->bdata,
				memmapsize, SMP_CACHE_BYTES,
				round_down(limit - memmapsize, PAGE_SIZE),
				limit);

	size_zones(zones, holes, start_pfn, end_pfn);
	free_area_init_node(nodeid, NODE_DATA(nodeid), zones,
			    start_pfn, holes);
}
176
177void __init numa_init_array(void)
178{
179 int rr, i;
180 /* There are unfortunately some poorly designed mainboards around
181 that only connect memory to a single CPU. This breaks the 1:1 cpu->node
182 mapping. To avoid this fill in the mapping for all possible
183 CPUs, as the number of CPUs is not known yet.
184 We round robin the existing nodes. */
85cc5135 185 rr = first_node(node_online_map);
1da177e4
LT
186 for (i = 0; i < NR_CPUS; i++) {
187 if (cpu_to_node[i] != NUMA_NO_NODE)
188 continue;
69d81fcd 189 numa_set_node(i, rr);
1da177e4
LT
190 rr = next_node(rr, node_online_map);
191 if (rr == MAX_NUMNODES)
192 rr = first_node(node_online_map);
1da177e4
LT
193 }
194
1da177e4
LT
195}
196
197#ifdef CONFIG_NUMA_EMU
198int numa_fake __initdata = 0;
199
/* Numa emulation */
static int numa_emulation(unsigned long start_pfn, unsigned long end_pfn)
{
	int i;
	struct bootnode nodes[MAX_NUMNODES];
	/* Even split of the pfn range across numa_fake nodes.
	   Caller guarantees numa_fake != 0. */
	unsigned long sz = ((end_pfn - start_pfn)<<PAGE_SHIFT) / numa_fake;

	/* Kludge needed for the hash function: if sz is not a power of
	   two, round it down to the largest power of two below it. */
	if (hweight64(sz) > 1) {
		unsigned long x = 1;
		while ((x << 1) < sz)
			x <<= 1;
		if (x < sz/2)
			printk(KERN_ERR "Numa emulation unbalanced. Complain to maintainer\n");
		sz = x;
	}

	memset(&nodes,0,sizeof(nodes));
	for (i = 0; i < numa_fake; i++) {
		nodes[i].start = (start_pfn<<PAGE_SHIFT) + i*sz;
		/* The last node absorbs whatever is left over. */
		if (i == numa_fake-1)
			sz = (end_pfn<<PAGE_SHIFT) - nodes[i].start;
		nodes[i].end = nodes[i].start + sz;
		printk(KERN_INFO "Faking node %d at %016Lx-%016Lx (%LuMB)\n",
		       i,
		       nodes[i].start, nodes[i].end,
		       (nodes[i].end - nodes[i].start) >> 20);
		node_set_online(i);
	}
	memnode_shift = compute_hash_shift(nodes, numa_fake);
	if (memnode_shift < 0) {
		memnode_shift = 0;
		printk(KERN_ERR "No NUMA hash function found. Emulation disabled.\n");
		return -1;
	}
	for_each_online_node(i)
		setup_node_bootmem(i, nodes[i].start, nodes[i].end);
	numa_init_array();
	return 0;
}
240#endif
241
/*
 * Discover the NUMA topology, trying in order: command-line emulation,
 * ACPI SRAT, K8 northbridge registers.  If all fail (or numa=off),
 * fall back to a single dummy node covering all memory.
 */
void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
{
	int i;

#ifdef CONFIG_NUMA_EMU
	if (numa_fake && !numa_emulation(start_pfn, end_pfn))
		return;
#endif

#ifdef CONFIG_ACPI_NUMA
	if (!numa_off && !acpi_scan_nodes(start_pfn << PAGE_SHIFT,
					  end_pfn << PAGE_SHIFT))
		return;
#endif

#ifdef CONFIG_K8_NUMA
	if (!numa_off && !k8_scan_nodes(start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT))
		return;
#endif
	printk(KERN_INFO "%s\n",
	       numa_off ? "NUMA turned off" : "No NUMA configuration found");

	printk(KERN_INFO "Faking a node at %016lx-%016lx\n",
	       start_pfn << PAGE_SHIFT,
	       end_pfn << PAGE_SHIFT);
	/* setup dummy node covering all memory */
	memnode_shift = 63;	/* any addr >> 63 indexes slot 0 below */
	memnodemap[0] = 0;
	nodes_clear(node_online_map);
	node_set_online(0);
	for (i = 0; i < NR_CPUS; i++)
		numa_set_node(i, 0);
	node_to_cpumask[0] = cpumask_of_cpu(0);
	setup_node_bootmem(0, start_pfn << PAGE_SHIFT, end_pfn << PAGE_SHIFT);
}
277
/* Record a newly brought-up CPU in its node's cpumask. */
__cpuinit void numa_add_cpu(int cpu)
{
	set_bit(cpu, &node_to_cpumask[cpu_to_node(cpu)]);
}
282
69d81fcd
AK
/* Bind a CPU to a node: keep the per-cpu PDA and cpu_to_node[] in sync. */
void __cpuinit numa_set_node(int cpu, int node)
{
	cpu_pda(cpu)->nodenumber = node;
	cpu_to_node[cpu] = node;
}
288
1da177e4
LT
289unsigned long __init numa_free_all_bootmem(void)
290{
291 int i;
292 unsigned long pages = 0;
293 for_each_online_node(i) {
294 pages += free_all_bootmem_node(NODE_DATA(i));
295 }
296 return pages;
297}
298
d3ee871e
BP
299#ifdef CONFIG_SPARSEMEM
300static void __init arch_sparse_init(void)
301{
302 int i;
303
304 for_each_online_node(i)
305 memory_present(i, node_start_pfn(i), node_end_pfn(i));
306
307 sparse_init();
308}
309#else
310#define arch_sparse_init() do {} while (0)
311#endif
312
1da177e4
LT
313void __init paging_init(void)
314{
315 int i;
d3ee871e
BP
316
317 arch_sparse_init();
318
1da177e4
LT
319 for_each_online_node(i) {
320 setup_node_zones(i);
321 }
322}
323
324/* [numa=off] */
325__init int numa_setup(char *opt)
326{
327 if (!strncmp(opt,"off",3))
328 numa_off = 1;
329#ifdef CONFIG_NUMA_EMU
330 if(!strncmp(opt, "fake=", 5)) {
331 numa_fake = simple_strtoul(opt+5,NULL,0); ;
332 if (numa_fake >= MAX_NUMNODES)
333 numa_fake = MAX_NUMNODES;
334 }
335#endif
336#ifdef CONFIG_ACPI_NUMA
337 if (!strncmp(opt,"noacpi",6))
338 acpi_numa = -1;
339#endif
340 return 1;
341}
342
05b3cbd8
RT
/*
 * Setup early cpu_to_node.
 *
 * Populate cpu_to_node[] only if x86_cpu_to_apicid[],
 * and apicid_to_node[] tables have valid entries for a CPU.
 * This means we skip cpu_to_node[] initialisation for NUMA
 * emulation and faking node case (when running a kernel compiled
 * for NUMA on a non NUMA box), which is OK as cpu_to_node[]
 * is already initialized in a round robin manner at numa_init_array,
 * prior to this call, and this initialization is good enough
 * for the fake NUMA cases.
 */
void __init init_cpu_to_node(void)
{
	int i;
	for (i = 0; i < NR_CPUS; i++) {
		u8 apicid = x86_cpu_to_apicid[i];
		if (apicid == BAD_APICID)	/* CPU slot not populated */
			continue;
		if (apicid_to_node[apicid] == NUMA_NO_NODE)	/* node unknown */
			continue;
		numa_set_node(i,apicid_to_node[apicid]);
	}
}
367
1da177e4
LT
/* Export the NUMA topology tables for modular code. */
EXPORT_SYMBOL(cpu_to_node);
EXPORT_SYMBOL(node_to_cpumask);
EXPORT_SYMBOL(memnode_shift);
EXPORT_SYMBOL(memnodemap);
EXPORT_SYMBOL(node_data);
cf050132
AK
373
374#ifdef CONFIG_DISCONTIGMEM
/*
 * Functions to convert PFNs from/to per node page addresses.
 * These are out of line because they are quite big.
 * They could be all tuned by pre caching more state.
 * Should do that.
 */

/* Requires pfn_valid(pfn) to be true */
struct page *pfn_to_page(unsigned long pfn)
{
	/* Find the owning node via the hash, then index its mem_map. */
	int nid = phys_to_nid(((unsigned long)(pfn)) << PAGE_SHIFT);
	return (pfn - node_start_pfn(nid)) + NODE_DATA(nid)->node_mem_map;
}
EXPORT_SYMBOL(pfn_to_page);
389
/* Inverse of pfn_to_page: offset within the zone's mem_map plus the
   zone's starting pfn. */
unsigned long page_to_pfn(struct page *page)
{
	return (long)(((page) - page_zone(page)->zone_mem_map) +
		page_zone(page)->zone_start_pfn);
}
EXPORT_SYMBOL(page_to_pfn);
396
397int pfn_valid(unsigned long pfn)
398{
399 unsigned nid;
400 if (pfn >= num_physpages)
401 return 0;
402 nid = pfn_to_nid(pfn);
403 if (nid == 0xff)
404 return 0;
405 return pfn >= node_start_pfn(nid) && (pfn) < node_end_pfn(nid);
406}
407EXPORT_SYMBOL(pfn_valid);
408#endif
This page took 0.136289 seconds and 5 git commands to generate.