Commit | Line | Data |
---|---|---|
e3cfe529 | 1 | /* |
1da177e4 LT |
2 | * Generic VM initialization for x86-64 NUMA setups. |
3 | * Copyright 2002,2003 Andi Kleen, SuSE Labs. | |
e3cfe529 | 4 | */ |
1da177e4 LT |
5 | #include <linux/kernel.h> |
6 | #include <linux/mm.h> | |
7 | #include <linux/string.h> | |
8 | #include <linux/init.h> | |
9 | #include <linux/bootmem.h> | |
72d7c3b3 | 10 | #include <linux/memblock.h> |
1da177e4 LT |
11 | #include <linux/mmzone.h> |
12 | #include <linux/ctype.h> | |
13 | #include <linux/module.h> | |
14 | #include <linux/nodemask.h> | |
3cc87e3f | 15 | #include <linux/sched.h> |
d8fc3afc | 16 | #include <linux/acpi.h> |
1da177e4 LT |
17 | |
18 | #include <asm/e820.h> | |
19 | #include <asm/proto.h> | |
20 | #include <asm/dma.h> | |
1da177e4 | 21 | #include <asm/acpi.h> |
23ac4ae8 | 22 | #include <asm/amd_nb.h> |
1da177e4 | 23 | |
b8ef9172 | 24 | #include "numa_internal.h" |
97e7b78d | 25 | |
/* Per-node pglist_data pointers, indexed by node id; filled by
 * setup_node_bootmem() and read-mostly afterwards. */
struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
EXPORT_SYMBOL(node_data);

/* Nodes discovered by the firmware NUMA parsers; consumed during init. */
nodemask_t numa_nodes_parsed __initdata;

/* Physical-address -> node hash map state (memnodemap, memnode_shift). */
struct memnode memnode;

/* Physical address and size of the dynamically allocated memnodemap[];
 * both stay 0 when the embedded map suffices or allocation failed. */
static unsigned long __initdata nodemap_addr;
static unsigned long __initdata nodemap_size;

/* Default memory layout descriptor filled by the NUMA init paths. */
static struct numa_meminfo numa_meminfo __initdata;

/* NUMA distance table: numa_distance_cnt * numa_distance_cnt u8 entries.
 * numa_distance may also hold a non-NULL failure sentinel; see
 * numa_alloc_distance(). */
static int numa_distance_cnt;
static u8 *numa_distance;
40 | ||
529a3404 ED |
41 | /* |
42 | * Given a shift value, try to populate memnodemap[] | |
43 | * Returns : | |
44 | * 1 if OK | |
45 | * 0 if memnodmap[] too small (of shift too small) | |
46 | * -1 if node overlap or lost ram (shift too big) | |
47 | */ | |
97e7b78d | 48 | static int __init populate_memnodemap(const struct numa_meminfo *mi, int shift) |
1da177e4 | 49 | { |
529a3404 | 50 | unsigned long addr, end; |
e3cfe529 | 51 | int i, res = -1; |
b684664f | 52 | |
43238382 | 53 | memset(memnodemap, 0xff, sizeof(s16)*memnodemapsize); |
97e7b78d TH |
54 | for (i = 0; i < mi->nr_blks; i++) { |
55 | addr = mi->blk[i].start; | |
56 | end = mi->blk[i].end; | |
529a3404 | 57 | if (addr >= end) |
b684664f | 58 | continue; |
076422d2 | 59 | if ((end >> shift) >= memnodemapsize) |
529a3404 ED |
60 | return 0; |
61 | do { | |
43238382 | 62 | if (memnodemap[addr >> shift] != NUMA_NO_NODE) |
b684664f | 63 | return -1; |
97e7b78d | 64 | memnodemap[addr >> shift] = mi->blk[i].nid; |
076422d2 | 65 | addr += (1UL << shift); |
529a3404 ED |
66 | } while (addr < end); |
67 | res = 1; | |
e3cfe529 | 68 | } |
529a3404 ED |
69 | return res; |
70 | } | |
71 | ||
076422d2 AS |
72 | static int __init allocate_cachealigned_memnodemap(void) |
73 | { | |
24a5da73 | 74 | unsigned long addr; |
076422d2 AS |
75 | |
76 | memnodemap = memnode.embedded_map; | |
316390b0 | 77 | if (memnodemapsize <= ARRAY_SIZE(memnode.embedded_map)) |
076422d2 | 78 | return 0; |
076422d2 | 79 | |
24a5da73 | 80 | addr = 0x8000; |
be3e89ee | 81 | nodemap_size = roundup(sizeof(s16) * memnodemapsize, L1_CACHE_BYTES); |
dbef7b56 | 82 | nodemap_addr = memblock_find_in_range(addr, get_max_mapped(), |
24a5da73 | 83 | nodemap_size, L1_CACHE_BYTES); |
a9ce6bc1 | 84 | if (nodemap_addr == MEMBLOCK_ERROR) { |
076422d2 AS |
85 | printk(KERN_ERR |
86 | "NUMA: Unable to allocate Memory to Node hash map\n"); | |
87 | nodemap_addr = nodemap_size = 0; | |
88 | return -1; | |
89 | } | |
24a5da73 | 90 | memnodemap = phys_to_virt(nodemap_addr); |
a9ce6bc1 | 91 | memblock_x86_reserve_range(nodemap_addr, nodemap_addr + nodemap_size, "MEMNODEMAP"); |
076422d2 AS |
92 | |
93 | printk(KERN_DEBUG "NUMA: Allocated memnodemap from %lx - %lx\n", | |
94 | nodemap_addr, nodemap_addr + nodemap_size); | |
95 | return 0; | |
96 | } | |
97 | ||
/*
 * The LSB of all start and end addresses in the node map is the value of the
 * maximum possible shift.
 */
static int __init extract_lsb_from_nodes(const struct numa_meminfo *mi)
{
	int i, nodes_used = 0;
	unsigned long start, end;
	unsigned long bitfield = 0, memtop = 0;

	for (i = 0; i < mi->nr_blks; i++) {
		start = mi->blk[i].start;
		end = mi->blk[i].end;
		/* ignore empty blocks */
		if (start >= end)
			continue;
		/* OR all start addresses: the lowest set bit across them
		 * bounds the usable shift */
		bitfield |= start;
		nodes_used++;
		if (end > memtop)
			memtop = end;
	}
	/* with at most one block any shift works; use the maximum */
	if (nodes_used <= 1)
		i = 63;
	else
		i = find_first_bit(&bitfield, sizeof(unsigned long)*8);
	/* number of map slots needed to cover [0, memtop) at this shift */
	memnodemapsize = (memtop >> i)+1;
	return i;
}
529a3404 | 125 | |
97e7b78d | 126 | static int __init compute_hash_shift(const struct numa_meminfo *mi) |
076422d2 AS |
127 | { |
128 | int shift; | |
529a3404 | 129 | |
97e7b78d | 130 | shift = extract_lsb_from_nodes(mi); |
076422d2 AS |
131 | if (allocate_cachealigned_memnodemap()) |
132 | return -1; | |
6b050f80 | 133 | printk(KERN_DEBUG "NUMA: Using %d for the hash shift.\n", |
529a3404 ED |
134 | shift); |
135 | ||
97e7b78d | 136 | if (populate_memnodemap(mi, shift) != 1) { |
e3cfe529 TG |
137 | printk(KERN_INFO "Your memory is not aligned you need to " |
138 | "rebuild your kernel with a bigger NODEMAPSIZE " | |
139 | "shift=%d\n", shift); | |
529a3404 ED |
140 | return -1; |
141 | } | |
b684664f | 142 | return shift; |
1da177e4 LT |
143 | } |
144 | ||
/* Early pfn-to-node lookup via the memnodemap hash, used before the
 * core VM can answer this itself. */
int __meminit __early_pfn_to_nid(unsigned long pfn)
{
	return phys_to_nid(pfn << PAGE_SHIFT);
}
bbfceef4 | 149 | |
/*
 * Allocate @size bytes (aligned to @align) for node @nodeid, preferring
 * node-local memory inside [@start, @end) and falling back to any mapped
 * memory above the DMA zone.  Returns a virtual address or NULL.
 */
static void * __init early_node_mem(int nodeid, unsigned long start,
				    unsigned long end, unsigned long size,
				    unsigned long align)
{
	unsigned long mem;

	/*
	 * put it on high as possible
	 * something will go with NODE_DATA
	 */
	if (start < (MAX_DMA_PFN<<PAGE_SHIFT))
		start = MAX_DMA_PFN<<PAGE_SHIFT;
	if (start < (MAX_DMA32_PFN<<PAGE_SHIFT) &&
	    end > (MAX_DMA32_PFN<<PAGE_SHIFT))
		start = MAX_DMA32_PFN<<PAGE_SHIFT;
	/* first try: node-local allocation within the adjusted range */
	mem = memblock_x86_find_in_range_node(nodeid, start, end, size, align);
	if (mem != MEMBLOCK_ERROR)
		return __va(mem);

	/* extend the search scope */
	end = max_pfn_mapped << PAGE_SHIFT;
	start = MAX_DMA_PFN << PAGE_SHIFT;
	mem = memblock_find_in_range(start, end, size, align);
	if (mem != MEMBLOCK_ERROR)
		return __va(mem);

	printk(KERN_ERR "Cannot find %lu bytes in node %d\n",
	       size, nodeid);

	return NULL;
}
181 | ||
d9c515ea TH |
182 | static int __init numa_add_memblk_to(int nid, u64 start, u64 end, |
183 | struct numa_meminfo *mi) | |
ef396ec9 | 184 | { |
56e827fb TH |
185 | /* ignore zero length blks */ |
186 | if (start == end) | |
187 | return 0; | |
97e7b78d | 188 | |
56e827fb TH |
189 | /* whine about and ignore invalid blks */ |
190 | if (start > end || nid < 0 || nid >= MAX_NUMNODES) { | |
191 | pr_warning("NUMA: Warning: invalid memblk node %d (%Lx-%Lx)\n", | |
192 | nid, start, end); | |
193 | return 0; | |
ef396ec9 | 194 | } |
ef396ec9 | 195 | |
56e827fb TH |
196 | if (mi->nr_blks >= NR_NODE_MEMBLKS) { |
197 | pr_err("NUMA: too many memblk ranges\n"); | |
ef396ec9 TH |
198 | return -EINVAL; |
199 | } | |
200 | ||
97e7b78d TH |
201 | mi->blk[mi->nr_blks].start = start; |
202 | mi->blk[mi->nr_blks].end = end; | |
203 | mi->blk[mi->nr_blks].nid = nid; | |
204 | mi->nr_blks++; | |
ef396ec9 TH |
205 | return 0; |
206 | } | |
207 | ||
90e6b677 TH |
208 | /** |
209 | * numa_remove_memblk_from - Remove one numa_memblk from a numa_meminfo | |
210 | * @idx: Index of memblk to remove | |
211 | * @mi: numa_meminfo to remove memblk from | |
212 | * | |
213 | * Remove @idx'th numa_memblk from @mi by shifting @mi->blk[] and | |
214 | * decrementing @mi->nr_blks. | |
215 | */ | |
b8ef9172 | 216 | void __init numa_remove_memblk_from(int idx, struct numa_meminfo *mi) |
2e756be4 TH |
217 | { |
218 | mi->nr_blks--; | |
219 | memmove(&mi->blk[idx], &mi->blk[idx + 1], | |
220 | (mi->nr_blks - idx) * sizeof(mi->blk[0])); | |
221 | } | |
222 | ||
90e6b677 TH |
223 | /** |
224 | * numa_add_memblk - Add one numa_memblk to numa_meminfo | |
225 | * @nid: NUMA node ID of the new memblk | |
226 | * @start: Start address of the new memblk | |
227 | * @end: End address of the new memblk | |
228 | * | |
229 | * Add a new memblk to the default numa_meminfo. | |
230 | * | |
231 | * RETURNS: | |
232 | * 0 on success, -errno on failure. | |
233 | */ | |
d9c515ea TH |
234 | int __init numa_add_memblk(int nid, u64 start, u64 end) |
235 | { | |
236 | return numa_add_memblk_to(nid, start, end, &numa_meminfo); | |
237 | } | |
238 | ||
/* Initialize bootmem allocator for a node */
void __init
setup_node_bootmem(int nodeid, unsigned long start, unsigned long end)
{
	unsigned long start_pfn, last_pfn, nodedata_phys;
	const int pgdat_size = roundup(sizeof(pg_data_t), PAGE_SIZE);
	int nid;

	if (!end)
		return;

	/*
	 * Don't confuse VM with a node that doesn't have the
	 * minimum amount of memory:
	 */
	if (end && (end - start) < NODE_MIN_SIZE)
		return;

	start = roundup(start, ZONE_ALIGN);

	printk(KERN_INFO "Initmem setup node %d %016lx-%016lx\n", nodeid,
	       start, end);

	start_pfn = start >> PAGE_SHIFT;
	last_pfn = end >> PAGE_SHIFT;

	/* allocate and pin storage for this node's pg_data_t */
	node_data[nodeid] = early_node_mem(nodeid, start, end, pgdat_size,
					   SMP_CACHE_BYTES);
	if (node_data[nodeid] == NULL)
		return;
	nodedata_phys = __pa(node_data[nodeid]);
	memblock_x86_reserve_range(nodedata_phys, nodedata_phys + pgdat_size, "NODE_DATA");
	printk(KERN_INFO " NODE_DATA [%016lx - %016lx]\n", nodedata_phys,
		nodedata_phys + pgdat_size - 1);
	/* the fallback path in early_node_mem() may have placed NODE_DATA
	 * on a different node; report that */
	nid = phys_to_nid(nodedata_phys);
	if (nid != nodeid)
		printk(KERN_INFO " NODE_DATA(%d) on node %d\n", nodeid, nid);

	memset(NODE_DATA(nodeid), 0, sizeof(pg_data_t));
	NODE_DATA(nodeid)->node_id = nodeid;
	NODE_DATA(nodeid)->node_start_pfn = start_pfn;
	NODE_DATA(nodeid)->node_spanned_pages = last_pfn - start_pfn;

	node_set_online(nodeid);
}
1da177e4 | 284 | |
90e6b677 TH |
285 | /** |
286 | * numa_cleanup_meminfo - Cleanup a numa_meminfo | |
287 | * @mi: numa_meminfo to clean up | |
288 | * | |
289 | * Sanitize @mi by merging and removing unncessary memblks. Also check for | |
290 | * conflicts and clear unused memblks. | |
291 | * | |
292 | * RETURNS: | |
293 | * 0 on success, -errno on failure. | |
294 | */ | |
b8ef9172 | 295 | int __init numa_cleanup_meminfo(struct numa_meminfo *mi) |
fd0435d8 | 296 | { |
56e827fb TH |
297 | const u64 low = 0; |
298 | const u64 high = (u64)max_pfn << PAGE_SHIFT; | |
2e756be4 | 299 | int i, j, k; |
ef396ec9 | 300 | |
2e756be4 | 301 | for (i = 0; i < mi->nr_blks; i++) { |
97e7b78d | 302 | struct numa_memblk *bi = &mi->blk[i]; |
ef396ec9 | 303 | |
56e827fb TH |
304 | /* make sure all blocks are inside the limits */ |
305 | bi->start = max(bi->start, low); | |
306 | bi->end = min(bi->end, high); | |
307 | ||
308 | /* and there's no empty block */ | |
309 | if (bi->start == bi->end) { | |
310 | numa_remove_memblk_from(i--, mi); | |
311 | continue; | |
312 | } | |
313 | ||
2e756be4 | 314 | for (j = i + 1; j < mi->nr_blks; j++) { |
97e7b78d | 315 | struct numa_memblk *bj = &mi->blk[j]; |
ef396ec9 TH |
316 | unsigned long start, end; |
317 | ||
56e827fb TH |
318 | /* |
319 | * See whether there are overlapping blocks. Whine | |
320 | * about but allow overlaps of the same nid. They | |
321 | * will be merged below. | |
322 | */ | |
323 | if (bi->end > bj->start && bi->start < bj->end) { | |
324 | if (bi->nid != bj->nid) { | |
325 | pr_err("NUMA: node %d (%Lx-%Lx) overlaps with node %d (%Lx-%Lx)\n", | |
326 | bi->nid, bi->start, bi->end, | |
327 | bj->nid, bj->start, bj->end); | |
328 | return -EINVAL; | |
329 | } | |
330 | pr_warning("NUMA: Warning: node %d (%Lx-%Lx) overlaps with itself (%Lx-%Lx)\n", | |
331 | bi->nid, bi->start, bi->end, | |
332 | bj->start, bj->end); | |
333 | } | |
334 | ||
2e756be4 TH |
335 | /* |
336 | * Join together blocks on the same node, holes | |
337 | * between which don't overlap with memory on other | |
338 | * nodes. | |
339 | */ | |
97e7b78d | 340 | if (bi->nid != bj->nid) |
ef396ec9 | 341 | continue; |
56e827fb TH |
342 | start = max(min(bi->start, bj->start), low); |
343 | end = min(max(bi->end, bj->end), high); | |
2e756be4 | 344 | for (k = 0; k < mi->nr_blks; k++) { |
97e7b78d TH |
345 | struct numa_memblk *bk = &mi->blk[k]; |
346 | ||
347 | if (bi->nid == bk->nid) | |
ef396ec9 | 348 | continue; |
97e7b78d | 349 | if (start < bk->end && end > bk->start) |
ef396ec9 TH |
350 | break; |
351 | } | |
97e7b78d | 352 | if (k < mi->nr_blks) |
ef396ec9 | 353 | continue; |
ef396ec9 | 354 | printk(KERN_INFO "NUMA: Node %d [%Lx,%Lx) + [%Lx,%Lx) -> [%lx,%lx)\n", |
97e7b78d | 355 | bi->nid, bi->start, bi->end, bj->start, bj->end, |
ef396ec9 | 356 | start, end); |
97e7b78d TH |
357 | bi->start = start; |
358 | bi->end = end; | |
2e756be4 | 359 | numa_remove_memblk_from(j--, mi); |
ef396ec9 TH |
360 | } |
361 | } | |
362 | ||
56e827fb TH |
363 | for (i = mi->nr_blks; i < ARRAY_SIZE(mi->blk); i++) { |
364 | mi->blk[i].start = mi->blk[i].end = 0; | |
365 | mi->blk[i].nid = NUMA_NO_NODE; | |
366 | } | |
367 | ||
f9c60251 TH |
368 | return 0; |
369 | } | |
370 | ||
4697bdcc TH |
371 | /* |
372 | * Set nodes, which have memory in @mi, in *@nodemask. | |
373 | */ | |
374 | static void __init numa_nodemask_from_meminfo(nodemask_t *nodemask, | |
375 | const struct numa_meminfo *mi) | |
376 | { | |
377 | int i; | |
378 | ||
379 | for (i = 0; i < ARRAY_SIZE(mi->blk); i++) | |
380 | if (mi->blk[i].start != mi->blk[i].end && | |
381 | mi->blk[i].nid != NUMA_NO_NODE) | |
382 | node_set(mi->blk[i].nid, *nodemask); | |
383 | } | |
384 | ||
/**
 * numa_reset_distance - Reset NUMA distance table
 *
 * The current table is freed.  The next numa_set_distance() call will
 * create a new one.
 */
void __init numa_reset_distance(void)
{
	size_t size;

	/* only free when a real table was allocated (cnt != 0);
	 * numa_distance may hold the allocation-failure sentinel */
	if (numa_distance_cnt) {
		size = numa_distance_cnt * sizeof(numa_distance[0]);
		memblock_x86_free_range(__pa(numa_distance),
					__pa(numa_distance) + size);
		numa_distance_cnt = 0;
	}
	/* clear unconditionally so a failure sentinel is also reset and
	 * the next numa_set_distance() retries allocation */
	numa_distance = NULL;
}
403 | ||
2bf50555 YL |
404 | static int __init numa_alloc_distance(void) |
405 | { | |
406 | nodemask_t nodes_parsed; | |
407 | size_t size; | |
408 | int i, j, cnt = 0; | |
409 | u64 phys; | |
410 | ||
411 | /* size the new table and allocate it */ | |
412 | nodes_parsed = numa_nodes_parsed; | |
413 | numa_nodemask_from_meminfo(&nodes_parsed, &numa_meminfo); | |
414 | ||
415 | for_each_node_mask(i, nodes_parsed) | |
416 | cnt = i; | |
1f565a89 DR |
417 | cnt++; |
418 | size = cnt * cnt * sizeof(numa_distance[0]); | |
2bf50555 YL |
419 | |
420 | phys = memblock_find_in_range(0, (u64)max_pfn_mapped << PAGE_SHIFT, | |
421 | size, PAGE_SIZE); | |
422 | if (phys == MEMBLOCK_ERROR) { | |
423 | pr_warning("NUMA: Warning: can't allocate distance table!\n"); | |
424 | /* don't retry until explicitly reset */ | |
425 | numa_distance = (void *)1LU; | |
426 | return -ENOMEM; | |
427 | } | |
428 | memblock_x86_reserve_range(phys, phys + size, "NUMA DIST"); | |
429 | ||
430 | numa_distance = __va(phys); | |
431 | numa_distance_cnt = cnt; | |
432 | ||
433 | /* fill with the default distances */ | |
434 | for (i = 0; i < cnt; i++) | |
435 | for (j = 0; j < cnt; j++) | |
436 | numa_distance[i * cnt + j] = i == j ? | |
437 | LOCAL_DISTANCE : REMOTE_DISTANCE; | |
438 | printk(KERN_DEBUG "NUMA: Initialized distance table, cnt=%d\n", cnt); | |
439 | ||
440 | return 0; | |
441 | } | |
442 | ||
90e6b677 TH |
443 | /** |
444 | * numa_set_distance - Set NUMA distance from one NUMA to another | |
445 | * @from: the 'from' node to set distance | |
446 | * @to: the 'to' node to set distance | |
447 | * @distance: NUMA distance | |
448 | * | |
449 | * Set the distance from node @from to @to to @distance. If distance table | |
450 | * doesn't exist, one which is large enough to accomodate all the currently | |
451 | * known nodes will be created. | |
ac7136b6 TH |
452 | */ |
453 | void __init numa_set_distance(int from, int to, int distance) | |
454 | { | |
2bf50555 YL |
455 | if (!numa_distance && numa_alloc_distance() < 0) |
456 | return; | |
ac7136b6 TH |
457 | |
458 | if (from >= numa_distance_cnt || to >= numa_distance_cnt) { | |
459 | printk_once(KERN_DEBUG "NUMA: Debug: distance out of bound, from=%d to=%d distance=%d\n", | |
460 | from, to, distance); | |
461 | return; | |
462 | } | |
463 | ||
464 | if ((u8)distance != distance || | |
465 | (from == to && distance != LOCAL_DISTANCE)) { | |
466 | pr_warn_once("NUMA: Warning: invalid distance parameter, from=%d to=%d distance=%d\n", | |
467 | from, to, distance); | |
468 | return; | |
469 | } | |
470 | ||
471 | numa_distance[from * numa_distance_cnt + to] = distance; | |
472 | } | |
473 | ||
474 | int __node_distance(int from, int to) | |
475 | { | |
ac7136b6 TH |
476 | if (from >= numa_distance_cnt || to >= numa_distance_cnt) |
477 | return from == to ? LOCAL_DISTANCE : REMOTE_DISTANCE; | |
478 | return numa_distance[from * numa_distance_cnt + to]; | |
479 | } | |
480 | EXPORT_SYMBOL(__node_distance); | |
481 | ||
/*
 * Sanity check to catch more bad NUMA configurations (they are amazingly
 * common).  Make sure the nodes cover all memory.
 */
static bool __init numa_meminfo_cover_memory(const struct numa_meminfo *mi)
{
	unsigned long numaram, e820ram;
	int i;

	/* sum of present (non-hole) pages claimed by the NUMA blocks */
	numaram = 0;
	for (i = 0; i < mi->nr_blks; i++) {
		unsigned long s = mi->blk[i].start >> PAGE_SHIFT;
		unsigned long e = mi->blk[i].end >> PAGE_SHIFT;
		numaram += e - s;
		numaram -= __absent_pages_in_range(mi->blk[i].nid, s, e);
		/* clamp transient underflow from absent-page accounting */
		if ((long)numaram < 0)
			numaram = 0;
	}

	/* pages actually present according to e820 (total minus holes) */
	e820ram = max_pfn - (memblock_x86_hole_size(0,
					max_pfn << PAGE_SHIFT) >> PAGE_SHIFT);
	/* We seem to lose 3 pages somewhere. Allow 1M of slack. */
	if ((long)(e820ram - numaram) >= (1 << (20 - PAGE_SHIFT))) {
		printk(KERN_ERR "NUMA: nodes only cover %luMB of your %luMB e820 RAM. Not used.\n",
		       (numaram << PAGE_SHIFT) >> 20,
		       (e820ram << PAGE_SHIFT) >> 20);
		return false;
	}
	return true;
}
512 | ||
/*
 * Register the layout in @mi with the core VM: build the phys-to-node
 * hash, hand active ranges to memblock, verify memory coverage and set
 * up bootmem for each node.  Returns 0 on success, -EINVAL on failure.
 */
static int __init numa_register_memblks(struct numa_meminfo *mi)
{
	int i, nid;

	/* Account for nodes with cpus and no memory */
	node_possible_map = numa_nodes_parsed;
	numa_nodemask_from_meminfo(&node_possible_map, mi);
	if (WARN_ON(nodes_empty(node_possible_map)))
		return -EINVAL;

	memnode_shift = compute_hash_shift(mi);
	if (memnode_shift < 0) {
		printk(KERN_ERR "NUMA: No NUMA node hash function found. Contact maintainer\n");
		return -EINVAL;
	}

	for (i = 0; i < mi->nr_blks; i++)
		memblock_x86_register_active_regions(mi->blk[i].nid,
					mi->blk[i].start >> PAGE_SHIFT,
					mi->blk[i].end >> PAGE_SHIFT);

	/* for out of order entries */
	sort_node_map();
	if (!numa_meminfo_cover_memory(mi))
		return -EINVAL;

	init_memory_mapping_high();

	/* Finally register nodes. */
	for_each_node_mask(nid, node_possible_map) {
		/* find the span covering all of this node's blocks */
		u64 start = (u64)max_pfn << PAGE_SHIFT;
		u64 end = 0;

		for (i = 0; i < mi->nr_blks; i++) {
			if (nid != mi->blk[i].nid)
				continue;
			start = min(mi->blk[i].start, start);
			end = max(mi->blk[i].end, end);
		}

		/* start >= end means the node owns no memory at all */
		if (start < end)
			setup_node_bootmem(nid, start, end);
	}

	return 0;
}
559 | ||
6d496f9f | 560 | static int __init dummy_numa_init(void) |
ffe77a46 | 561 | { |
1da177e4 LT |
562 | printk(KERN_INFO "%s\n", |
563 | numa_off ? "NUMA turned off" : "No NUMA configuration found"); | |
e3cfe529 | 564 | printk(KERN_INFO "Faking a node at %016lx-%016lx\n", |
86ef4dbf | 565 | 0LU, max_pfn << PAGE_SHIFT); |
ffe77a46 | 566 | |
92d4a437 | 567 | node_set(0, numa_nodes_parsed); |
43a662f0 | 568 | numa_add_memblk(0, 0, (u64)max_pfn << PAGE_SHIFT); |
ec8cf29b TH |
569 | |
570 | return 0; | |
571 | } | |
572 | ||
ffe77a46 TH |
573 | void __init initmem_init(void) |
574 | { | |
575 | int (*numa_init[])(void) = { [2] = dummy_numa_init }; | |
ffe77a46 TH |
576 | int i, j; |
577 | ||
578 | if (!numa_off) { | |
579 | #ifdef CONFIG_ACPI_NUMA | |
580 | numa_init[0] = x86_acpi_numa_init; | |
ffe77a46 TH |
581 | #endif |
582 | #ifdef CONFIG_AMD_NUMA | |
583 | numa_init[1] = amd_numa_init; | |
ffe77a46 TH |
584 | #endif |
585 | } | |
586 | ||
587 | for (i = 0; i < ARRAY_SIZE(numa_init); i++) { | |
588 | if (!numa_init[i]) | |
589 | continue; | |
590 | ||
591 | for (j = 0; j < MAX_LOCAL_APIC; j++) | |
592 | set_apicid_to_node(j, NUMA_NO_NODE); | |
593 | ||
92d4a437 | 594 | nodes_clear(numa_nodes_parsed); |
ffe77a46 TH |
595 | nodes_clear(node_possible_map); |
596 | nodes_clear(node_online_map); | |
97e7b78d | 597 | memset(&numa_meminfo, 0, sizeof(numa_meminfo)); |
fd0435d8 | 598 | remove_all_active_ranges(); |
ac7136b6 | 599 | numa_reset_distance(); |
ffe77a46 TH |
600 | |
601 | if (numa_init[i]() < 0) | |
602 | continue; | |
206e4208 | 603 | |
56e827fb TH |
604 | if (numa_cleanup_meminfo(&numa_meminfo) < 0) |
605 | continue; | |
fbe99959 TH |
606 | |
607 | numa_emulation(&numa_meminfo, numa_distance_cnt); | |
608 | ||
f9c60251 | 609 | if (numa_register_memblks(&numa_meminfo) < 0) |
43a662f0 TH |
610 | continue; |
611 | ||
fd0435d8 TH |
612 | for (j = 0; j < nr_cpu_ids; j++) { |
613 | int nid = early_cpu_to_node(j); | |
614 | ||
615 | if (nid == NUMA_NO_NODE) | |
616 | continue; | |
617 | if (!node_online(nid)) | |
618 | numa_clear_node(j); | |
619 | } | |
620 | numa_init_array(); | |
621 | return; | |
ffe77a46 TH |
622 | } |
623 | BUG(); | |
69d81fcd AK |
624 | } |
625 | ||
e3cfe529 TG |
626 | unsigned long __init numa_free_all_bootmem(void) |
627 | { | |
1da177e4 | 628 | unsigned long pages = 0; |
e3cfe529 TG |
629 | int i; |
630 | ||
631 | for_each_online_node(i) | |
1da177e4 | 632 | pages += free_all_bootmem_node(NODE_DATA(i)); |
e3cfe529 | 633 | |
08677214 | 634 | pages += free_all_memory_core_early(MAX_NUMNODES); |
08677214 | 635 | |
1da177e4 | 636 | return pages; |
e3cfe529 | 637 | } |
1da177e4 | 638 | |
bbc9e2f4 | 639 | int __cpuinit numa_cpu_node(int cpu) |
d9c2d5ac | 640 | { |
bbc9e2f4 | 641 | int apicid = early_per_cpu(x86_cpu_to_apicid, cpu); |
d9c2d5ac | 642 | |
bbc9e2f4 TH |
643 | if (apicid != BAD_APICID) |
644 | return __apicid_to_node[apicid]; | |
645 | return NUMA_NO_NODE; | |
d9c2d5ac | 646 | } |