[PATCH] Have x86 use add_active_range() and free_area_init_nodes
arch/x86_64/mm/srat.c
/*
 * ACPI 3.0 based NUMA setup
 * Copyright 2004 Andi Kleen, SuSE Labs.
 *
 * Reads the ACPI SRAT table to figure out what memory belongs to which CPUs.
 *
 * Called from acpi_numa_init while reading the SRAT and SLIT tables.
 * Assumes all memory regions belonging to a single proximity domain
 * are in one chunk. Holes between them will be included in the node.
 */

#include <linux/kernel.h>
#include <linux/acpi.h>
#include <linux/mmzone.h>
#include <linux/bitmap.h>
#include <linux/module.h>
#include <linux/topology.h>
#include <linux/bootmem.h>
#include <linux/mm.h>
#include <asm/proto.h>
#include <asm/numa.h>
#include <asm/e820.h>

int acpi_numa __initdata;

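/*
 * If the ACPI memory hotplug driver is built but the generic memory
 * hotplug core (CONFIG_MEMORY_HOTPLUG) is not, the mem_map for SRAT
 * hotplug areas must be preallocated at boot; RESERVE_HOTADD selects
 * that reservation path below.
 */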
#if (defined(CONFIG_ACPI_HOTPLUG_MEMORY) || \
	defined(CONFIG_ACPI_HOTPLUG_MEMORY_MODULE)) \
		&& !defined(CONFIG_MEMORY_HOTPLUG)
#define RESERVE_HOTADD 1
#endif

static struct acpi_table_slit *acpi_slit;

static nodemask_t nodes_parsed __initdata;
static struct bootnode nodes[MAX_NUMNODES] __initdata;
static struct bootnode nodes_add[MAX_NUMNODES] __initdata;
static int found_add_area __initdata;
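/* Set from the numa=hotadd=<percent> boot option (parsed in numa.c):
   caps how much memory the preallocated mem_map for hotplug areas
   may consume. */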
int hotadd_percent __initdata = 0;
#ifndef RESERVE_HOTADD
#define hotadd_percent 0	/* Ignore all settings */
#endif

/* Too small nodes confuse the VM badly. Usually they result
   from BIOS bugs. */
#define NODE_MIN_SIZE (4*1024*1024)

static __init int setup_node(int pxm)
{
	return acpi_map_pxm_to_node(pxm);
}

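/* Return the first already-parsed node whose [start, end) range overlaps
   the given range, or -1 if there is no conflict. */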
static __init int conflicting_nodes(unsigned long start, unsigned long end)
{
	int i;
	for_each_node_mask(i, nodes_parsed) {
		struct bootnode *nd = &nodes[i];
		if (nd->start == nd->end)
			continue;
		if (nd->end > start && nd->start < end)
			return i;
		if (nd->end == end && nd->start == start)
			return i;
	}
	return -1;
}

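/* Clamp node i to the [start, end) range; a clamped node shrinks to an
   empty range rather than ending up with start past end. Not applied
   once hotadd areas have been found. */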
static __init void cutoff_node(int i, unsigned long start, unsigned long end)
{
	struct bootnode *nd = &nodes[i];

	if (found_add_area)
		return;

	if (nd->start < start) {
		nd->start = start;
		if (nd->end < nd->start)
			nd->start = nd->end;
	}
	if (nd->end > end) {
		nd->end = end;
		if (nd->start > nd->end)
			nd->start = nd->end;
	}
}

static __init void bad_srat(void)
{
	int i;
	printk(KERN_ERR "SRAT: SRAT not used.\n");
	acpi_numa = -1;
	found_add_area = 0;
	for (i = 0; i < MAX_LOCAL_APIC; i++)
		apicid_to_node[i] = NUMA_NO_NODE;
	for (i = 0; i < MAX_NUMNODES; i++)
		nodes_add[i].start = nodes_add[i].end = 0;
}

static __init inline int srat_disabled(void)
{
	return numa_off || acpi_numa < 0;
}

/*
 * A lot of BIOSes fill in 10 (= no distance) everywhere. This messes
 * up the NUMA heuristics, which want the local node to have a smaller
 * distance than the others.
 * Do some quick checks here and only use the SLIT if it passes.
 */
static __init int slit_valid(struct acpi_table_slit *slit)
{
	int i, j;
	int d = slit->localities;
	for (i = 0; i < d; i++) {
		for (j = 0; j < d; j++) {
			u8 val = slit->entry[d*i + j];
			if (i == j) {
				if (val != 10)
					return 0;
			} else if (val <= 10)
				return 0;
		}
	}
	return 1;
}

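/*
 * For example, a sane two-node SLIT looks like
 *	10 20
 *	20 10
 * -- exactly 10 on the diagonal, strictly greater than 10 elsewhere.
 */
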
/* Callback for SLIT parsing */
void __init acpi_numa_slit_init(struct acpi_table_slit *slit)
{
	if (!slit_valid(slit)) {
		printk(KERN_INFO "ACPI: SLIT table looks invalid. Not used.\n");
		return;
	}
	acpi_slit = slit;
}

/* Callback for Proximity Domain -> LAPIC mapping */
void __init
acpi_numa_processor_affinity_init(struct acpi_table_processor_affinity *pa)
{
	int pxm, node;
	if (srat_disabled())
		return;
	if (pa->header.length != sizeof(struct acpi_table_processor_affinity)) {
		bad_srat();
		return;
	}
	if (pa->flags.enabled == 0)
		return;
	pxm = pa->proximity_domain;
	node = setup_node(pxm);
	if (node < 0) {
		printk(KERN_ERR "SRAT: Too many proximity domains %x\n", pxm);
		bad_srat();
		return;
	}
	apicid_to_node[pa->apic_id] = node;
	acpi_numa = 1;
	printk(KERN_INFO "SRAT: PXM %u -> APIC %u -> Node %u\n",
	       pxm, pa->apic_id, node);
}

#ifdef RESERVE_HOTADD
/*
 * Protect against too large hotadd areas that would fill up memory.
 */
static int hotadd_enough_memory(struct bootnode *nd)
{
	static unsigned long allocated;
	static unsigned long last_area_end;
	unsigned long pages = (nd->end - nd->start) >> PAGE_SHIFT;
	long mem = pages * sizeof(struct page);
	unsigned long addr;
	unsigned long allowed;
	unsigned long oldpages = pages;

	if (mem < 0)
		return 0;
	allowed = (end_pfn - e820_hole_size(0, end_pfn)) * PAGE_SIZE;
	allowed = (allowed / 100) * hotadd_percent;
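	/* mem is the struct page array cost of this area; all hotadd areas
	   together may take at most hotadd_percent percent of the usable
	   (non-hole) RAM. */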
	if (allocated + mem > allowed) {
		unsigned long range;
		/* Give them at least part of their hotadd memory up to
		   hotadd_percent. It would be better to spread the limit
		   out over multiple hotplug areas, but that is too
		   complicated right now. */
		if (allocated >= allowed)
			return 0;
		range = allowed - allocated;
		pages = (range / PAGE_SIZE);
		mem = pages * sizeof(struct page);
		nd->end = nd->start + range;
	}
	/* Not completely foolproof, but a good sanity check */
	addr = find_e820_area(last_area_end, end_pfn<<PAGE_SHIFT, mem);
	if (addr == -1UL)
		return 0;
	if (pages != oldpages)
		printk(KERN_NOTICE "SRAT: Hotadd area limited to %lu bytes\n",
			pages << PAGE_SHIFT);
	last_area_end = addr + mem;
	allocated += mem;
	return 1;
}

/*
 * It is fine to add this area to the nodes data; it will be used later.
 * This code supports one contiguous hot add area per node.
 */
static int reserve_hotadd(int node, unsigned long start, unsigned long end)
{
	unsigned long s_pfn = start >> PAGE_SHIFT;
	unsigned long e_pfn = end >> PAGE_SHIFT;
	int changed = 0;
	struct bootnode *nd = &nodes_add[node];

	/* I had some trouble with strange memory hotadd regions breaking
	   the boot. Be very strict here and reject anything unexpected.
	   If you want working memory hotadd write correct SRATs.

	   The node size check is a basic sanity check to guard against
	   mistakes */
	if ((signed long)(end - start) < NODE_MIN_SIZE) {
		printk(KERN_ERR "SRAT: Hotplug area too small\n");
		return -1;
	}

	/* The area must be entirely an e820 hole, i.e. it must not overlap
	   memory that is already present. This check might be a bit too
	   strict, but I'm keeping it for now. */
	if (e820_hole_size(s_pfn, e_pfn) != e_pfn - s_pfn) {
		printk(KERN_ERR "SRAT: Hotplug area has existing memory\n");
		return -1;
	}

	if (!hotadd_enough_memory(&nodes_add[node])) {
		printk(KERN_ERR "SRAT: Hotplug area too large\n");
		return -1;
	}

	/* Looks good */

	found_add_area = 1;
	if (nd->start == nd->end) {
		nd->start = start;
		nd->end = end;
		changed = 1;
	} else {
		if (nd->start == end) {
			nd->start = start;
			changed = 1;
		}
		if (nd->end == start) {
			nd->end = end;
			changed = 1;
		}
		if (!changed)
			printk(KERN_ERR "SRAT: Hotplug zone not contiguous. Partly ignored\n");
	}

	if ((nd->end >> PAGE_SHIFT) > end_pfn)
		end_pfn = nd->end >> PAGE_SHIFT;

	if (changed)
		printk(KERN_INFO "SRAT: hot plug zone found %Lx - %Lx\n", nd->start, nd->end);
	return 0;
}
#endif

/* Callback for parsing of the Proximity Domain <-> Memory Area mappings */
void __init
acpi_numa_memory_affinity_init(struct acpi_table_memory_affinity *ma)
{
	struct bootnode *nd, oldnode;
	unsigned long start, end;
	int node, pxm;
	int i;

	if (srat_disabled())
		return;
	if (ma->header.length != sizeof(struct acpi_table_memory_affinity)) {
		bad_srat();
		return;
	}
	if (ma->flags.enabled == 0)
		return;
	if (ma->flags.hot_pluggable && hotadd_percent == 0)
		return;
	start = ma->base_addr_lo | ((u64)ma->base_addr_hi << 32);
	end = start + (ma->length_lo | ((u64)ma->length_hi << 32));
	pxm = ma->proximity_domain;
	node = setup_node(pxm);
	if (node < 0) {
		printk(KERN_ERR "SRAT: Too many proximity domains.\n");
		bad_srat();
		return;
	}
	i = conflicting_nodes(start, end);
	if (i == node) {
		printk(KERN_WARNING
		"SRAT: Warning: PXM %d (%lx-%lx) overlaps with itself (%Lx-%Lx)\n",
			pxm, start, end, nodes[i].start, nodes[i].end);
	} else if (i >= 0) {
		printk(KERN_ERR
		       "SRAT: PXM %d (%lx-%lx) overlaps with PXM %d (%Lx-%Lx)\n",
		       pxm, start, end, node_to_pxm(i),
		       nodes[i].start, nodes[i].end);
		bad_srat();
		return;
	}
	nd = &nodes[node];
	oldnode = *nd;
	if (!node_test_and_set(node, nodes_parsed)) {
		nd->start = start;
		nd->end = end;
	} else {
		if (start < nd->start)
			nd->start = start;
		if (nd->end < end)
			nd->end = end;
	}

	printk(KERN_INFO "SRAT: Node %u PXM %u %Lx-%Lx\n", node, pxm,
	       nd->start, nd->end);

#ifdef RESERVE_HOTADD
	if (ma->flags.hot_pluggable && reserve_hotadd(node, start, end) < 0) {
		/* Ignore hotadd region. Undo damage */
		printk(KERN_NOTICE "SRAT: Hotplug region ignored\n");
		*nd = oldnode;
		if ((nd->start | nd->end) == 0)
			node_clear(node, nodes_parsed);
	}
#endif
}

/* Sanity check to catch more bad SRATs (they are amazingly common).
   Make sure the PXMs cover all memory. */
static int nodes_cover_memory(void)
{
	int i;
	unsigned long pxmram, e820ram;

	pxmram = 0;
	for_each_node_mask(i, nodes_parsed) {
		unsigned long s = nodes[i].start >> PAGE_SHIFT;
		unsigned long e = nodes[i].end >> PAGE_SHIFT;
		pxmram += e - s;
		pxmram -= e820_hole_size(s, e);
		pxmram -= nodes_add[i].end - nodes_add[i].start;
		if ((long)pxmram < 0)
			pxmram = 0;
	}

	e820ram = end_pfn - e820_hole_size(0, end_pfn);
	/* We seem to lose 3 pages somewhere. Allow 1MB of slack. */
	if ((long)(e820ram - pxmram) >= (1 << (20 - PAGE_SHIFT))) {
		printk(KERN_ERR
	"SRAT: PXMs only cover %luMB of your %luMB e820 RAM. Not used.\n",
			(pxmram << PAGE_SHIFT) >> 20,
			(e820ram << PAGE_SHIFT) >> 20);
		return 0;
	}
	return 1;
}

static void unparse_node(int node)
{
	int i;
	node_clear(node, nodes_parsed);
	for (i = 0; i < MAX_LOCAL_APIC; i++) {
		if (apicid_to_node[i] == node)
			apicid_to_node[i] = NUMA_NO_NODE;
	}
}

void __init acpi_numa_arch_fixup(void) {}

/* Use the information discovered above to actually set up the nodes. */
int __init acpi_scan_nodes(unsigned long start, unsigned long end)
{
	int i;

	/* First clean up the node list */
	for (i = 0; i < MAX_NUMNODES; i++) {
		cutoff_node(i, start, end);
		if ((nodes[i].end - nodes[i].start) < NODE_MIN_SIZE) {
			unparse_node(i);
			node_set_offline(i);
		}
	}

	if (acpi_numa <= 0)
		return -1;

	if (!nodes_cover_memory()) {
		bad_srat();
		return -1;
	}

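	/* compute_hash_shift sets up the memnodemap[] table used by
	   phys_to_nid(): an address is mapped to its node by indexing
	   memnodemap[addr >> memnode_shift]. It fails if no shift gives
	   a unique node per table slot. */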
	memnode_shift = compute_hash_shift(nodes, MAX_NUMNODES);
	if (memnode_shift < 0) {
		printk(KERN_ERR
		     "SRAT: No NUMA node hash function found. Contact maintainer\n");
		bad_srat();
		return -1;
	}

	/* Finally register nodes */
	for_each_node_mask(i, nodes_parsed)
		setup_node_bootmem(i, nodes[i].start, nodes[i].end);
	/* Try again in case setup_node_bootmem missed one due
	   to missing bootmem */
	for_each_node_mask(i, nodes_parsed)
		if (!node_online(i))
			setup_node_bootmem(i, nodes[i].start, nodes[i].end);

	for (i = 0; i < NR_CPUS; i++) {
		if (cpu_to_node[i] == NUMA_NO_NODE)
			continue;
		if (!node_isset(cpu_to_node[i], nodes_parsed))
			numa_set_node(i, NUMA_NO_NODE);
	}
	numa_init_array();
	return 0;
}

void __init srat_reserve_add_area(int nodeid)
{
	if (found_add_area && nodes_add[nodeid].end) {
		u64 total_mb;

		printk(KERN_INFO "SRAT: Reserving hot-add memory space "
				"for node %d at %Lx-%Lx\n",
			nodeid, nodes_add[nodeid].start, nodes_add[nodeid].end);
		total_mb = (nodes_add[nodeid].end - nodes_add[nodeid].start)
					>> PAGE_SHIFT;
		total_mb *= sizeof(struct page);
		total_mb >>= 20;
		printk(KERN_INFO "SRAT: This will cost you %Lu MB of "
				"pre-allocated memory.\n", (unsigned long long)total_mb);
		reserve_bootmem_node(NODE_DATA(nodeid), nodes_add[nodeid].start,
			       nodes_add[nodeid].end - nodes_add[nodeid].start);
	}
}

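/* SLIT entries are relative distances scaled so that 10 means local
   (1.0). Without a SLIT, fall back to 10 for the local node and 20
   for every remote node. */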
int __node_distance(int a, int b)
{
	int index;

	if (!acpi_slit)
		return a == b ? 10 : 20;
	index = acpi_slit->localities * node_to_pxm(a);
	return acpi_slit->entry[index + node_to_pxm(b)];
}

EXPORT_SYMBOL(__node_distance);