/*
 * Generic VM initialization for x86-64 NUMA setups.
 * Copyright 2002,2003 Andi Kleen, SuSE Labs.
 */
5 #include <linux/kernel.h>
7 #include <linux/string.h>
8 #include <linux/init.h>
9 #include <linux/bootmem.h>
10 #include <linux/memblock.h>
11 #include <linux/mmzone.h>
12 #include <linux/ctype.h>
13 #include <linux/module.h>
14 #include <linux/nodemask.h>
15 #include <linux/sched.h>
16 #include <linux/acpi.h>
19 #include <asm/proto.h>
22 #include <asm/amd_nb.h>
24 #include "numa_internal.h"
26 struct pglist_data
*node_data
[MAX_NUMNODES
] __read_mostly
;
27 EXPORT_SYMBOL(node_data
);
29 nodemask_t numa_nodes_parsed __initdata
;
31 static struct numa_meminfo numa_meminfo __initdata
;
32 static int numa_distance_cnt
;
33 static u8
*numa_distance
;
35 static void * __init
early_node_mem(int nodeid
, unsigned long start
,
36 unsigned long end
, unsigned long size
,
42 * put it on high as possible
43 * something will go with NODE_DATA
45 if (start
< (MAX_DMA_PFN
<<PAGE_SHIFT
))
46 start
= MAX_DMA_PFN
<<PAGE_SHIFT
;
47 if (start
< (MAX_DMA32_PFN
<<PAGE_SHIFT
) &&
48 end
> (MAX_DMA32_PFN
<<PAGE_SHIFT
))
49 start
= MAX_DMA32_PFN
<<PAGE_SHIFT
;
50 mem
= memblock_x86_find_in_range_node(nodeid
, start
, end
, size
, align
);
51 if (mem
!= MEMBLOCK_ERROR
)
54 /* extend the search scope */
55 end
= max_pfn_mapped
<< PAGE_SHIFT
;
56 start
= MAX_DMA_PFN
<< PAGE_SHIFT
;
57 mem
= memblock_find_in_range(start
, end
, size
, align
);
58 if (mem
!= MEMBLOCK_ERROR
)
61 printk(KERN_ERR
"Cannot find %lu bytes in node %d\n",
67 static int __init
numa_add_memblk_to(int nid
, u64 start
, u64 end
,
68 struct numa_meminfo
*mi
)
70 /* ignore zero length blks */
74 /* whine about and ignore invalid blks */
75 if (start
> end
|| nid
< 0 || nid
>= MAX_NUMNODES
) {
76 pr_warning("NUMA: Warning: invalid memblk node %d (%Lx-%Lx)\n",
81 if (mi
->nr_blks
>= NR_NODE_MEMBLKS
) {
82 pr_err("NUMA: too many memblk ranges\n");
86 mi
->blk
[mi
->nr_blks
].start
= start
;
87 mi
->blk
[mi
->nr_blks
].end
= end
;
88 mi
->blk
[mi
->nr_blks
].nid
= nid
;
94 * numa_remove_memblk_from - Remove one numa_memblk from a numa_meminfo
95 * @idx: Index of memblk to remove
96 * @mi: numa_meminfo to remove memblk from
98 * Remove @idx'th numa_memblk from @mi by shifting @mi->blk[] and
99 * decrementing @mi->nr_blks.
101 void __init
numa_remove_memblk_from(int idx
, struct numa_meminfo
*mi
)
104 memmove(&mi
->blk
[idx
], &mi
->blk
[idx
+ 1],
105 (mi
->nr_blks
- idx
) * sizeof(mi
->blk
[0]));
109 * numa_add_memblk - Add one numa_memblk to numa_meminfo
110 * @nid: NUMA node ID of the new memblk
111 * @start: Start address of the new memblk
112 * @end: End address of the new memblk
114 * Add a new memblk to the default numa_meminfo.
117 * 0 on success, -errno on failure.
119 int __init
numa_add_memblk(int nid
, u64 start
, u64 end
)
121 return numa_add_memblk_to(nid
, start
, end
, &numa_meminfo
);
124 /* Initialize bootmem allocator for a node */
126 setup_node_bootmem(int nodeid
, unsigned long start
, unsigned long end
)
128 unsigned long start_pfn
, last_pfn
, nodedata_phys
;
129 const int pgdat_size
= roundup(sizeof(pg_data_t
), PAGE_SIZE
);
136 * Don't confuse VM with a node that doesn't have the
137 * minimum amount of memory:
139 if (end
&& (end
- start
) < NODE_MIN_SIZE
)
142 start
= roundup(start
, ZONE_ALIGN
);
144 printk(KERN_INFO
"Initmem setup node %d %016lx-%016lx\n", nodeid
,
147 start_pfn
= start
>> PAGE_SHIFT
;
148 last_pfn
= end
>> PAGE_SHIFT
;
150 node_data
[nodeid
] = early_node_mem(nodeid
, start
, end
, pgdat_size
,
152 if (node_data
[nodeid
] == NULL
)
154 nodedata_phys
= __pa(node_data
[nodeid
]);
155 memblock_x86_reserve_range(nodedata_phys
, nodedata_phys
+ pgdat_size
, "NODE_DATA");
156 printk(KERN_INFO
" NODE_DATA [%016lx - %016lx]\n", nodedata_phys
,
157 nodedata_phys
+ pgdat_size
- 1);
158 nid
= early_pfn_to_nid(nodedata_phys
>> PAGE_SHIFT
);
160 printk(KERN_INFO
" NODE_DATA(%d) on node %d\n", nodeid
, nid
);
162 memset(NODE_DATA(nodeid
), 0, sizeof(pg_data_t
));
163 NODE_DATA(nodeid
)->node_id
= nodeid
;
164 NODE_DATA(nodeid
)->node_start_pfn
= start_pfn
;
165 NODE_DATA(nodeid
)->node_spanned_pages
= last_pfn
- start_pfn
;
167 node_set_online(nodeid
);
171 * numa_cleanup_meminfo - Cleanup a numa_meminfo
172 * @mi: numa_meminfo to clean up
174 * Sanitize @mi by merging and removing unncessary memblks. Also check for
175 * conflicts and clear unused memblks.
178 * 0 on success, -errno on failure.
180 int __init
numa_cleanup_meminfo(struct numa_meminfo
*mi
)
183 const u64 high
= (u64
)max_pfn
<< PAGE_SHIFT
;
186 for (i
= 0; i
< mi
->nr_blks
; i
++) {
187 struct numa_memblk
*bi
= &mi
->blk
[i
];
189 /* make sure all blocks are inside the limits */
190 bi
->start
= max(bi
->start
, low
);
191 bi
->end
= min(bi
->end
, high
);
193 /* and there's no empty block */
194 if (bi
->start
== bi
->end
) {
195 numa_remove_memblk_from(i
--, mi
);
199 for (j
= i
+ 1; j
< mi
->nr_blks
; j
++) {
200 struct numa_memblk
*bj
= &mi
->blk
[j
];
201 unsigned long start
, end
;
204 * See whether there are overlapping blocks. Whine
205 * about but allow overlaps of the same nid. They
206 * will be merged below.
208 if (bi
->end
> bj
->start
&& bi
->start
< bj
->end
) {
209 if (bi
->nid
!= bj
->nid
) {
210 pr_err("NUMA: node %d (%Lx-%Lx) overlaps with node %d (%Lx-%Lx)\n",
211 bi
->nid
, bi
->start
, bi
->end
,
212 bj
->nid
, bj
->start
, bj
->end
);
215 pr_warning("NUMA: Warning: node %d (%Lx-%Lx) overlaps with itself (%Lx-%Lx)\n",
216 bi
->nid
, bi
->start
, bi
->end
,
221 * Join together blocks on the same node, holes
222 * between which don't overlap with memory on other
225 if (bi
->nid
!= bj
->nid
)
227 start
= max(min(bi
->start
, bj
->start
), low
);
228 end
= min(max(bi
->end
, bj
->end
), high
);
229 for (k
= 0; k
< mi
->nr_blks
; k
++) {
230 struct numa_memblk
*bk
= &mi
->blk
[k
];
232 if (bi
->nid
== bk
->nid
)
234 if (start
< bk
->end
&& end
> bk
->start
)
239 printk(KERN_INFO
"NUMA: Node %d [%Lx,%Lx) + [%Lx,%Lx) -> [%lx,%lx)\n",
240 bi
->nid
, bi
->start
, bi
->end
, bj
->start
, bj
->end
,
244 numa_remove_memblk_from(j
--, mi
);
248 for (i
= mi
->nr_blks
; i
< ARRAY_SIZE(mi
->blk
); i
++) {
249 mi
->blk
[i
].start
= mi
->blk
[i
].end
= 0;
250 mi
->blk
[i
].nid
= NUMA_NO_NODE
;
257 * Set nodes, which have memory in @mi, in *@nodemask.
259 static void __init
numa_nodemask_from_meminfo(nodemask_t
*nodemask
,
260 const struct numa_meminfo
*mi
)
264 for (i
= 0; i
< ARRAY_SIZE(mi
->blk
); i
++)
265 if (mi
->blk
[i
].start
!= mi
->blk
[i
].end
&&
266 mi
->blk
[i
].nid
!= NUMA_NO_NODE
)
267 node_set(mi
->blk
[i
].nid
, *nodemask
);
271 * numa_reset_distance - Reset NUMA distance table
273 * The current table is freed. The next numa_set_distance() call will
276 void __init
numa_reset_distance(void)
278 size_t size
= numa_distance_cnt
* numa_distance_cnt
* sizeof(numa_distance
[0]);
280 /* numa_distance could be 1LU marking allocation failure, test cnt */
281 if (numa_distance_cnt
)
282 memblock_x86_free_range(__pa(numa_distance
),
283 __pa(numa_distance
) + size
);
284 numa_distance_cnt
= 0;
285 numa_distance
= NULL
; /* enable table creation */
288 static int __init
numa_alloc_distance(void)
290 nodemask_t nodes_parsed
;
295 /* size the new table and allocate it */
296 nodes_parsed
= numa_nodes_parsed
;
297 numa_nodemask_from_meminfo(&nodes_parsed
, &numa_meminfo
);
299 for_each_node_mask(i
, nodes_parsed
)
302 size
= cnt
* cnt
* sizeof(numa_distance
[0]);
304 phys
= memblock_find_in_range(0, (u64
)max_pfn_mapped
<< PAGE_SHIFT
,
306 if (phys
== MEMBLOCK_ERROR
) {
307 pr_warning("NUMA: Warning: can't allocate distance table!\n");
308 /* don't retry until explicitly reset */
309 numa_distance
= (void *)1LU;
312 memblock_x86_reserve_range(phys
, phys
+ size
, "NUMA DIST");
314 numa_distance
= __va(phys
);
315 numa_distance_cnt
= cnt
;
317 /* fill with the default distances */
318 for (i
= 0; i
< cnt
; i
++)
319 for (j
= 0; j
< cnt
; j
++)
320 numa_distance
[i
* cnt
+ j
] = i
== j
?
321 LOCAL_DISTANCE
: REMOTE_DISTANCE
;
322 printk(KERN_DEBUG
"NUMA: Initialized distance table, cnt=%d\n", cnt
);
328 * numa_set_distance - Set NUMA distance from one NUMA to another
329 * @from: the 'from' node to set distance
330 * @to: the 'to' node to set distance
331 * @distance: NUMA distance
333 * Set the distance from node @from to @to to @distance. If distance table
334 * doesn't exist, one which is large enough to accommodate all the currently
335 * known nodes will be created.
337 * If such table cannot be allocated, a warning is printed and further
338 * calls are ignored until the distance table is reset with
339 * numa_reset_distance().
341 * If @from or @to is higher than the highest known node at the time of
342 * table creation or @distance doesn't make sense, the call is ignored.
343 * This is to allow simplification of specific NUMA config implementations.
345 void __init
numa_set_distance(int from
, int to
, int distance
)
347 if (!numa_distance
&& numa_alloc_distance() < 0)
350 if (from
>= numa_distance_cnt
|| to
>= numa_distance_cnt
) {
351 printk_once(KERN_DEBUG
"NUMA: Debug: distance out of bound, from=%d to=%d distance=%d\n",
356 if ((u8
)distance
!= distance
||
357 (from
== to
&& distance
!= LOCAL_DISTANCE
)) {
358 pr_warn_once("NUMA: Warning: invalid distance parameter, from=%d to=%d distance=%d\n",
363 numa_distance
[from
* numa_distance_cnt
+ to
] = distance
;
366 int __node_distance(int from
, int to
)
368 if (from
>= numa_distance_cnt
|| to
>= numa_distance_cnt
)
369 return from
== to
? LOCAL_DISTANCE
: REMOTE_DISTANCE
;
370 return numa_distance
[from
* numa_distance_cnt
+ to
];
372 EXPORT_SYMBOL(__node_distance
);
375 * Sanity check to catch more bad NUMA configurations (they are amazingly
376 * common). Make sure the nodes cover all memory.
378 static bool __init
numa_meminfo_cover_memory(const struct numa_meminfo
*mi
)
380 unsigned long numaram
, e820ram
;
384 for (i
= 0; i
< mi
->nr_blks
; i
++) {
385 unsigned long s
= mi
->blk
[i
].start
>> PAGE_SHIFT
;
386 unsigned long e
= mi
->blk
[i
].end
>> PAGE_SHIFT
;
388 numaram
-= __absent_pages_in_range(mi
->blk
[i
].nid
, s
, e
);
389 if ((long)numaram
< 0)
393 e820ram
= max_pfn
- (memblock_x86_hole_size(0,
394 max_pfn
<< PAGE_SHIFT
) >> PAGE_SHIFT
);
395 /* We seem to lose 3 pages somewhere. Allow 1M of slack. */
396 if ((long)(e820ram
- numaram
) >= (1 << (20 - PAGE_SHIFT
))) {
397 printk(KERN_ERR
"NUMA: nodes only cover %luMB of your %luMB e820 RAM. Not used.\n",
398 (numaram
<< PAGE_SHIFT
) >> 20,
399 (e820ram
<< PAGE_SHIFT
) >> 20);
405 static int __init
numa_register_memblks(struct numa_meminfo
*mi
)
409 /* Account for nodes with cpus and no memory */
410 node_possible_map
= numa_nodes_parsed
;
411 numa_nodemask_from_meminfo(&node_possible_map
, mi
);
412 if (WARN_ON(nodes_empty(node_possible_map
)))
415 for (i
= 0; i
< mi
->nr_blks
; i
++)
416 memblock_x86_register_active_regions(mi
->blk
[i
].nid
,
417 mi
->blk
[i
].start
>> PAGE_SHIFT
,
418 mi
->blk
[i
].end
>> PAGE_SHIFT
);
420 /* for out of order entries */
422 if (!numa_meminfo_cover_memory(mi
))
425 /* Finally register nodes. */
426 for_each_node_mask(nid
, node_possible_map
) {
427 u64 start
= (u64
)max_pfn
<< PAGE_SHIFT
;
430 for (i
= 0; i
< mi
->nr_blks
; i
++) {
431 if (nid
!= mi
->blk
[i
].nid
)
433 start
= min(mi
->blk
[i
].start
, start
);
434 end
= max(mi
->blk
[i
].end
, end
);
438 setup_node_bootmem(nid
, start
, end
);
445 * dummy_numma_init - Fallback dummy NUMA init
447 * Used if there's no underlying NUMA architecture, NUMA initialization
448 * fails, or NUMA is disabled on the command line.
450 * Must online at least one node and add memory blocks that cover all
451 * allowed memory. This function must not fail.
453 static int __init
dummy_numa_init(void)
455 printk(KERN_INFO
"%s\n",
456 numa_off
? "NUMA turned off" : "No NUMA configuration found");
457 printk(KERN_INFO
"Faking a node at %016lx-%016lx\n",
458 0LU, max_pfn
<< PAGE_SHIFT
);
460 node_set(0, numa_nodes_parsed
);
461 numa_add_memblk(0, 0, (u64
)max_pfn
<< PAGE_SHIFT
);
466 static int __init
numa_init(int (*init_func
)(void))
471 for (i
= 0; i
< MAX_LOCAL_APIC
; i
++)
472 set_apicid_to_node(i
, NUMA_NO_NODE
);
474 nodes_clear(numa_nodes_parsed
);
475 nodes_clear(node_possible_map
);
476 nodes_clear(node_online_map
);
477 memset(&numa_meminfo
, 0, sizeof(numa_meminfo
));
478 remove_all_active_ranges();
479 numa_reset_distance();
484 ret
= numa_cleanup_meminfo(&numa_meminfo
);
488 numa_emulation(&numa_meminfo
, numa_distance_cnt
);
490 ret
= numa_register_memblks(&numa_meminfo
);
494 for (i
= 0; i
< nr_cpu_ids
; i
++) {
495 int nid
= early_cpu_to_node(i
);
497 if (nid
== NUMA_NO_NODE
)
499 if (!node_online(nid
))
506 void __init
initmem_init(void)
511 #ifdef CONFIG_ACPI_NUMA
512 ret
= numa_init(x86_acpi_numa_init
);
516 #ifdef CONFIG_AMD_NUMA
517 ret
= numa_init(amd_numa_init
);
523 numa_init(dummy_numa_init
);
526 unsigned long __init
numa_free_all_bootmem(void)
528 unsigned long pages
= 0;
531 for_each_online_node(i
)
532 pages
+= free_all_bootmem_node(NODE_DATA(i
));
534 pages
+= free_all_memory_core_early(MAX_NUMNODES
);
539 int __cpuinit
numa_cpu_node(int cpu
)
541 int apicid
= early_per_cpu(x86_cpu_to_apicid
, cpu
);
543 if (apicid
!= BAD_APICID
)
544 return __apicid_to_node
[apicid
];