x86: nuke a ton of unused exports
arch/x86/mm/numa_64.c
/*
 * Generic VM initialization for x86-64 NUMA setups.
 * Copyright 2002,2003 Andi Kleen, SuSE Labs.
 */
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/init.h>
#include <linux/bootmem.h>
#include <linux/mmzone.h>
#include <linux/ctype.h>
#include <linux/module.h>
#include <linux/nodemask.h>

#include <asm/e820.h>
#include <asm/proto.h>
#include <asm/dma.h>
#include <asm/numa.h>
#include <asm/acpi.h>
#include <asm/k8.h>

#ifndef Dprintk
#define Dprintk(x...)
#endif

struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
bootmem_data_t plat_node_bdata[MAX_NUMNODES];

struct memnode memnode;

unsigned char cpu_to_node[NR_CPUS] __read_mostly = {
	[0 ... NR_CPUS-1] = NUMA_NO_NODE
};
unsigned char apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = {
	[0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
};
cpumask_t node_to_cpumask[MAX_NUMNODES] __read_mostly;

int numa_off __initdata;
unsigned long __initdata nodemap_addr;
unsigned long __initdata nodemap_size;

/*
 * Given a shift value, try to populate memnodemap[]
 * Returns:
 * 1 if OK
 * 0 if memnodemap[] too small (or shift too small)
 * -1 if node overlap or lost RAM (shift too big)
 */
static int __init
populate_memnodemap(const struct bootnode *nodes, int numnodes, int shift)
{
	int i;
	int res = -1;
	unsigned long addr, end;

	memset(memnodemap, 0xff, memnodemapsize);
	for (i = 0; i < numnodes; i++) {
		addr = nodes[i].start;
		end = nodes[i].end;
		if (addr >= end)
			continue;
		if ((end >> shift) >= memnodemapsize)
			return 0;
		do {
			if (memnodemap[addr >> shift] != 0xff)
				return -1;
			memnodemap[addr >> shift] = i;
			addr += (1UL << shift);
		} while (addr < end);
		res = 1;
	}
	return res;
}

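/*
 * Allocate the memnodemap[] hash table.  Small maps fit into the
 * cache-aligned embedded_map inside struct memnode; larger ones are
 * carved out of free e820 memory above 0x8000.
 */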
static int __init allocate_cachealigned_memnodemap(void)
{
	unsigned long pad, pad_addr;

	memnodemap = memnode.embedded_map;
	if (memnodemapsize <= 48)
		return 0;

	pad = L1_CACHE_BYTES - 1;
	pad_addr = 0x8000;
	nodemap_size = pad + memnodemapsize;
	nodemap_addr = find_e820_area(pad_addr, end_pfn<<PAGE_SHIFT,
				      nodemap_size);
	if (nodemap_addr == -1UL) {
		printk(KERN_ERR
		       "NUMA: Unable to allocate Memory to Node hash map\n");
		nodemap_addr = nodemap_size = 0;
		return -1;
	}
	pad_addr = (nodemap_addr + pad) & ~pad;
	memnodemap = phys_to_virt(pad_addr);

	printk(KERN_DEBUG "NUMA: Allocated memnodemap from %lx - %lx\n",
	       nodemap_addr, nodemap_addr + nodemap_size);
	return 0;
}

/*
 * The LSB of all start and end addresses in the node map is the value of the
 * maximum possible shift.
 */
static int __init
extract_lsb_from_nodes(const struct bootnode *nodes, int numnodes)
{
	int i, nodes_used = 0;
	unsigned long start, end;
	unsigned long bitfield = 0, memtop = 0;

	for (i = 0; i < numnodes; i++) {
		start = nodes[i].start;
		end = nodes[i].end;
		if (start >= end)
			continue;
		bitfield |= start;
		nodes_used++;
		if (end > memtop)
			memtop = end;
	}
	if (nodes_used <= 1)
		i = 63;
	else
		i = find_first_bit(&bitfield, sizeof(unsigned long)*8);
	memnodemapsize = (memtop >> i)+1;
	return i;
}

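/*
 * Pick the shift for the memnodemap[] hash: derive it from the node
 * boundaries, allocate the map, and fill it in.  Returns the shift on
 * success or -1 on failure.
 */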
int __init compute_hash_shift(struct bootnode *nodes, int numnodes)
{
	int shift;

	shift = extract_lsb_from_nodes(nodes, numnodes);
	if (allocate_cachealigned_memnodemap())
		return -1;
	printk(KERN_DEBUG "NUMA: Using %d for the hash shift.\n",
	       shift);

	if (populate_memnodemap(nodes, numnodes, shift) != 1) {
		printk(KERN_INFO
	"Your memory is not aligned you need to rebuild your kernel "
	"with a bigger NODEMAPSIZE shift=%d\n",
		       shift);
		return -1;
	}
	return shift;
}

#ifdef CONFIG_SPARSEMEM
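/* Early boot pfn -> node lookup, via the phys_to_nid() hash. */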
int early_pfn_to_nid(unsigned long pfn)
{
	return phys_to_nid(pfn << PAGE_SHIFT);
}
#endif

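/*
 * Grab "size" bytes for node "nodeid": first try free e820 memory inside
 * [start, end), then fall back to the generic bootmem allocator with a goal
 * above MAX_DMA_ADDRESS.  Returns a virtual address or NULL.
 */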
static void * __init
early_node_mem(int nodeid, unsigned long start, unsigned long end,
	       unsigned long size)
{
	unsigned long mem = find_e820_area(start, end, size);
	void *ptr;

	if (mem != -1L)
		return __va(mem);
	ptr = __alloc_bootmem_nopanic(size,
				SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS));
	if (ptr == NULL) {
		printk(KERN_ERR "Cannot find %lu bytes in node %d\n",
		       size, nodeid);
		return NULL;
	}
	return ptr;
}

/* Initialize bootmem allocator for a node */
void __init setup_node_bootmem(int nodeid, unsigned long start, unsigned long end)
{
	unsigned long start_pfn, end_pfn, bootmap_pages, bootmap_size, bootmap_start;
	unsigned long nodedata_phys;
	void *bootmap;
	const int pgdat_size = round_up(sizeof(pg_data_t), PAGE_SIZE);

	start = round_up(start, ZONE_ALIGN);

	printk(KERN_INFO "Bootmem setup node %d %016lx-%016lx\n", nodeid, start, end);

	start_pfn = start >> PAGE_SHIFT;
	end_pfn = end >> PAGE_SHIFT;

	node_data[nodeid] = early_node_mem(nodeid, start, end, pgdat_size);
	if (node_data[nodeid] == NULL)
		return;
	nodedata_phys = __pa(node_data[nodeid]);

	memset(NODE_DATA(nodeid), 0, sizeof(pg_data_t));
	NODE_DATA(nodeid)->bdata = &plat_node_bdata[nodeid];
	NODE_DATA(nodeid)->node_start_pfn = start_pfn;
	NODE_DATA(nodeid)->node_spanned_pages = end_pfn - start_pfn;

	/* Find a place for the bootmem map */
	bootmap_pages = bootmem_bootmap_pages(end_pfn - start_pfn);
	bootmap_start = round_up(nodedata_phys + pgdat_size, PAGE_SIZE);
	bootmap = early_node_mem(nodeid, bootmap_start, end,
				 bootmap_pages<<PAGE_SHIFT);
	if (bootmap == NULL) {
		if (nodedata_phys < start || nodedata_phys >= end)
			free_bootmem((unsigned long)node_data[nodeid], pgdat_size);
		node_data[nodeid] = NULL;
		return;
	}
	bootmap_start = __pa(bootmap);
	Dprintk("bootmap start %lu pages %lu\n", bootmap_start, bootmap_pages);

	bootmap_size = init_bootmem_node(NODE_DATA(nodeid),
					 bootmap_start >> PAGE_SHIFT,
					 start_pfn, end_pfn);

	free_bootmem_with_active_regions(nodeid, end);

	reserve_bootmem_node(NODE_DATA(nodeid), nodedata_phys, pgdat_size);
	reserve_bootmem_node(NODE_DATA(nodeid), bootmap_start, bootmap_pages<<PAGE_SHIFT);
#ifdef CONFIG_ACPI_NUMA
	srat_reserve_add_area(nodeid);
#endif
	node_set_online(nodeid);
}

/* Initialize final allocator for a zone */
void __init setup_node_zones(int nodeid)
{
	unsigned long start_pfn, end_pfn, memmapsize, limit;

	start_pfn = node_start_pfn(nodeid);
	end_pfn = node_end_pfn(nodeid);

	Dprintk(KERN_INFO "Setting up memmap for node %d %lx-%lx\n",
		nodeid, start_pfn, end_pfn);

	/*
	 * Try to allocate mem_map at end to not fill up precious <4GB
	 * memory.
	 */
	memmapsize = sizeof(struct page) * (end_pfn-start_pfn);
	limit = end_pfn << PAGE_SHIFT;
#ifdef CONFIG_FLAT_NODE_MEM_MAP
	NODE_DATA(nodeid)->node_mem_map =
		__alloc_bootmem_core(NODE_DATA(nodeid)->bdata,
				     memmapsize, SMP_CACHE_BYTES,
				     round_down(limit - memmapsize, PAGE_SIZE),
				     limit);
#endif
}

void __init numa_init_array(void)
{
	int rr, i;

	/*
	 * There are unfortunately some poorly designed mainboards around
	 * that only connect memory to a single CPU. This breaks the 1:1
	 * cpu->node mapping. To avoid this, fill in the mapping for all
	 * possible CPUs, as the number of CPUs is not known yet. We round
	 * robin the existing nodes.
	 */
	rr = first_node(node_online_map);
	for (i = 0; i < NR_CPUS; i++) {
		if (cpu_to_node(i) != NUMA_NO_NODE)
			continue;
		numa_set_node(i, rr);
		rr = next_node(rr, node_online_map);
		if (rr == MAX_NUMNODES)
			rr = first_node(node_online_map);
	}
}

#ifdef CONFIG_NUMA_EMU
/* NUMA emulation */
char *cmdline __initdata;

/*
 * Sets up nid to range from addr to addr + size.  If the end boundary is
 * greater than max_addr, then max_addr is used instead.  The return value is 0
 * if there is additional memory left for allocation past addr and -1 otherwise.
 * addr is adjusted to be at the end of the node.
 */
static int __init setup_node_range(int nid, struct bootnode *nodes, u64 *addr,
				   u64 size, u64 max_addr)
{
	int ret = 0;
	nodes[nid].start = *addr;
	*addr += size;
	if (*addr >= max_addr) {
		*addr = max_addr;
		ret = -1;
	}
	nodes[nid].end = *addr;
	node_set(nid, node_possible_map);
	printk(KERN_INFO "Faking node %d at %016Lx-%016Lx (%LuMB)\n", nid,
	       nodes[nid].start, nodes[nid].end,
	       (nodes[nid].end - nodes[nid].start) >> 20);
	return ret;
}

/*
 * Splits num_nodes nodes up equally starting at node_start.  The return value
 * is the number of nodes split up and addr is adjusted to be at the end of the
 * last node allocated.
 */
static int __init split_nodes_equally(struct bootnode *nodes, u64 *addr,
				      u64 max_addr, int node_start,
				      int num_nodes)
{
	unsigned int big;
	u64 size;
	int i;

	if (num_nodes <= 0)
		return -1;
	if (num_nodes > MAX_NUMNODES)
		num_nodes = MAX_NUMNODES;
	size = (max_addr - *addr - e820_hole_size(*addr, max_addr)) /
	       num_nodes;
	/*
	 * Calculate the number of big nodes that can be allocated as a result
	 * of consolidating the leftovers.
	 */
	big = ((size & ~FAKE_NODE_MIN_HASH_MASK) * num_nodes) /
	      FAKE_NODE_MIN_SIZE;

	/* Round down to nearest FAKE_NODE_MIN_SIZE. */
	size &= FAKE_NODE_MIN_HASH_MASK;
	if (!size) {
		printk(KERN_ERR "Not enough memory for each node. "
		       "NUMA emulation disabled.\n");
		return -1;
	}

	for (i = node_start; i < num_nodes + node_start; i++) {
		u64 end = *addr + size;
		if (i < big)
			end += FAKE_NODE_MIN_SIZE;
		/*
		 * The final node can have the remaining system RAM.  Other
		 * nodes receive roughly the same amount of available pages.
		 */
		if (i == num_nodes + node_start - 1)
			end = max_addr;
		else
			while (end - *addr - e820_hole_size(*addr, end) <
			       size) {
				end += FAKE_NODE_MIN_SIZE;
				if (end > max_addr) {
					end = max_addr;
					break;
				}
			}
		if (setup_node_range(i, nodes, addr, end - *addr, max_addr) < 0)
			break;
	}
	return i - node_start + 1;
}

/*
 * Splits the remaining system RAM into chunks of size.  The remaining memory
 * is always assigned to a final node and can be asymmetric.  Returns the
 * number of nodes split.
 */
static int __init split_nodes_by_size(struct bootnode *nodes, u64 *addr,
				      u64 max_addr, int node_start, u64 size)
{
	int i = node_start;
	size = (size << 20) & FAKE_NODE_MIN_HASH_MASK;
	while (!setup_node_range(i++, nodes, addr, size, max_addr))
		;
	return i - node_start;
}

/*
 * Sets up the system RAM area from start_pfn to end_pfn according to the
 * numa=fake command-line option.
 */
static int __init numa_emulation(unsigned long start_pfn, unsigned long end_pfn)
{
	struct bootnode nodes[MAX_NUMNODES];
	u64 addr = start_pfn << PAGE_SHIFT;
	u64 max_addr = end_pfn << PAGE_SHIFT;
	int num_nodes = 0;
	int coeff_flag;
	int coeff = -1;
	int num = 0;
	u64 size;
	int i;

	memset(&nodes, 0, sizeof(nodes));
	/*
	 * If the numa=fake command-line is just a single number N, split the
	 * system RAM into N fake nodes.
	 */
	if (!strchr(cmdline, '*') && !strchr(cmdline, ',')) {
		num_nodes = split_nodes_equally(nodes, &addr, max_addr, 0,
						simple_strtol(cmdline, NULL, 0));
		if (num_nodes < 0)
			return num_nodes;
		goto out;
	}

	/* Parse the command line. */
	for (coeff_flag = 0; ; cmdline++) {
		if (*cmdline && isdigit(*cmdline)) {
			num = num * 10 + *cmdline - '0';
			continue;
		}
		if (*cmdline == '*') {
			if (num > 0)
				coeff = num;
			coeff_flag = 1;
		}
		if (!*cmdline || *cmdline == ',') {
			if (!coeff_flag)
				coeff = 1;
			/*
			 * Round down to the nearest FAKE_NODE_MIN_SIZE.
			 * Command-line coefficients are in megabytes.
			 */
			size = ((u64)num << 20) & FAKE_NODE_MIN_HASH_MASK;
			if (size)
				for (i = 0; i < coeff; i++, num_nodes++)
					if (setup_node_range(num_nodes, nodes,
						&addr, size, max_addr) < 0)
						goto done;
			if (!*cmdline)
				break;
			coeff_flag = 0;
			coeff = -1;
		}
		num = 0;
	}
done:
	if (!num_nodes)
		return -1;
	/* Fill remainder of system RAM, if appropriate. */
	if (addr < max_addr) {
		if (coeff_flag && coeff < 0) {
			/* Split remaining nodes into num-sized chunks */
			num_nodes += split_nodes_by_size(nodes, &addr, max_addr,
							 num_nodes, num);
			goto out;
		}
		switch (*(cmdline - 1)) {
		case '*':
			/* Split remaining nodes into coeff chunks */
			if (coeff <= 0)
				break;
			num_nodes += split_nodes_equally(nodes, &addr, max_addr,
							 num_nodes, coeff);
			break;
		case ',':
			/* Do not allocate remaining system RAM */
			break;
		default:
			/* Give one final node */
			setup_node_range(num_nodes, nodes, &addr,
					 max_addr - addr, max_addr);
			num_nodes++;
		}
	}
out:
	memnode_shift = compute_hash_shift(nodes, num_nodes);
	if (memnode_shift < 0) {
		memnode_shift = 0;
		printk(KERN_ERR "No NUMA hash function found. NUMA emulation "
		       "disabled.\n");
		return -1;
	}

	/*
	 * We need to vacate all active ranges that may have been registered by
	 * SRAT and set acpi_numa to -1 so that srat_disabled() always returns
	 * true.  NUMA emulation has succeeded so we will not scan ACPI nodes.
	 */
	remove_all_active_ranges();
#ifdef CONFIG_ACPI_NUMA
	acpi_numa = -1;
#endif
	for_each_node_mask(i, node_possible_map) {
		e820_register_active_regions(i, nodes[i].start >> PAGE_SHIFT,
					     nodes[i].end >> PAGE_SHIFT);
		setup_node_bootmem(i, nodes[i].start, nodes[i].end);
	}
	acpi_fake_nodes(nodes, num_nodes);
	numa_init_array();
	return 0;
}
#endif /* CONFIG_NUMA_EMU */

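/*
 * Discover the NUMA topology at boot: try NUMA emulation, then ACPI SRAT,
 * then the K8 northbridge registers, and fall back to a single dummy node
 * spanning all of memory if none of them provides a usable configuration.
 */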
void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
{
	int i;

	nodes_clear(node_possible_map);

#ifdef CONFIG_NUMA_EMU
	if (cmdline && !numa_emulation(start_pfn, end_pfn))
		return;
	nodes_clear(node_possible_map);
#endif

#ifdef CONFIG_ACPI_NUMA
	if (!numa_off && !acpi_scan_nodes(start_pfn << PAGE_SHIFT,
					  end_pfn << PAGE_SHIFT))
		return;
	nodes_clear(node_possible_map);
#endif

#ifdef CONFIG_K8_NUMA
	if (!numa_off && !k8_scan_nodes(start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT))
		return;
	nodes_clear(node_possible_map);
#endif
	printk(KERN_INFO "%s\n",
	       numa_off ? "NUMA turned off" : "No NUMA configuration found");

	printk(KERN_INFO "Faking a node at %016lx-%016lx\n",
	       start_pfn << PAGE_SHIFT,
	       end_pfn << PAGE_SHIFT);
	/* setup dummy node covering all memory */
	memnode_shift = 63;
	memnodemap = memnode.embedded_map;
	memnodemap[0] = 0;
	nodes_clear(node_online_map);
	node_set_online(0);
	node_set(0, node_possible_map);
	for (i = 0; i < NR_CPUS; i++)
		numa_set_node(i, 0);
	node_to_cpumask[0] = cpumask_of_cpu(0);
	e820_register_active_regions(0, start_pfn, end_pfn);
	setup_node_bootmem(0, start_pfn << PAGE_SHIFT, end_pfn << PAGE_SHIFT);
}

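/* Record a CPU in the cpumask of the node it belongs to. */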
__cpuinit void numa_add_cpu(int cpu)
{
	set_bit(cpu, &node_to_cpumask[cpu_to_node(cpu)]);
}

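/* Bind a CPU to a node in both the per-CPU PDA and cpu_to_node[]. */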
void __cpuinit numa_set_node(int cpu, int node)
{
	cpu_pda(cpu)->nodenumber = node;
	cpu_to_node(cpu) = node;
}

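/* Release bootmem on every online node; returns the number of pages freed. */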
unsigned long __init numa_free_all_bootmem(void)
{
	int i;
	unsigned long pages = 0;

	for_each_online_node(i) {
		pages += free_all_bootmem_node(NODE_DATA(i));
	}
	return pages;
}

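/* Set up zone limits and the per-node mem_map, then initialize all zones. */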
void __init paging_init(void)
{
	int i;
	unsigned long max_zone_pfns[MAX_NR_ZONES];

	memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
	max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN;
	max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN;
	max_zone_pfns[ZONE_NORMAL] = end_pfn;

	sparse_memory_present_with_active_regions(MAX_NUMNODES);
	sparse_init();

	for_each_online_node(i) {
		setup_node_zones(i);
	}

	free_area_init_nodes(max_zone_pfns);
}

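/*
 * Parse the "numa=" early parameter: "off", "fake=<config>" (with
 * CONFIG_NUMA_EMU), and "noacpi"/"hotadd=<percent>" (with CONFIG_ACPI_NUMA).
 */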
static __init int numa_setup(char *opt)
{
	if (!opt)
		return -EINVAL;
	if (!strncmp(opt, "off", 3))
		numa_off = 1;
#ifdef CONFIG_NUMA_EMU
	if (!strncmp(opt, "fake=", 5))
		cmdline = opt + 5;
#endif
#ifdef CONFIG_ACPI_NUMA
	if (!strncmp(opt, "noacpi", 6))
		acpi_numa = -1;
	if (!strncmp(opt, "hotadd=", 7))
		hotadd_percent = simple_strtoul(opt+7, NULL, 10);
#endif
	return 0;
}

early_param("numa", numa_setup);

/*
 * Setup early cpu_to_node.
 *
 * Populate cpu_to_node[] only if x86_cpu_to_apicid[] and apicid_to_node[]
 * have valid entries for a CPU.  This means we skip cpu_to_node[]
 * initialisation for NUMA emulation and the faked-node case (when running
 * a kernel compiled for NUMA on a non-NUMA box), which is OK because
 * cpu_to_node[] has already been initialized in a round-robin manner by
 * numa_init_array() prior to this call, and that initialization is good
 * enough for the fake NUMA cases.
 */
void __init init_cpu_to_node(void)
{
	int i;

	for (i = 0; i < NR_CPUS; i++) {
		u8 apicid = x86_cpu_to_apicid_init[i];
		if (apicid == BAD_APICID)
			continue;
		if (apicid_to_node[apicid] == NUMA_NO_NODE)
			continue;
		numa_set_node(i, apicid_to_node[apicid]);
	}
}

EXPORT_SYMBOL(cpu_to_node);
EXPORT_SYMBOL(node_to_cpumask);
EXPORT_SYMBOL(node_data);

#ifdef CONFIG_DISCONTIGMEM
/*
 * Functions to convert PFNs from/to per node page addresses.
 * These are out of line because they are quite big.
 * They could all be tuned by pre-caching more state.
 * Should do that.
 */

int pfn_valid(unsigned long pfn)
{
	unsigned nid;

	if (pfn >= num_physpages)
		return 0;
	nid = pfn_to_nid(pfn);
	if (nid == 0xff)
		return 0;
	return pfn >= node_start_pfn(nid) && pfn < node_end_pfn(nid);
}
EXPORT_SYMBOL(pfn_valid);
#endif