/*
 * arch/x86_64/mm/numa.c
 *
 * Generic VM initialization for x86-64 NUMA setups.
 * Copyright 2002,2003 Andi Kleen, SuSE Labs.
 */
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/init.h>
#include <linux/bootmem.h>
#include <linux/mmzone.h>
#include <linux/ctype.h>
#include <linux/module.h>
#include <linux/nodemask.h>

#include <asm/e820.h>
#include <asm/proto.h>
#include <asm/dma.h>
#include <asm/numa.h>
#include <asm/acpi.h>

#ifndef Dprintk
#define Dprintk(x...)
#endif

struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
bootmem_data_t plat_node_bdata[MAX_NUMNODES];

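/*
 * Direct-mapped physical-address -> node lookup table: the node that
 * owns a physical address is memnodemap[addr >> memnode_shift], with
 * 0xff meaning "no node".  For example, with memnode_shift == 24 each
 * table entry covers one 16MB chunk of the physical address space.
 */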
int memnode_shift;
u8 memnodemap[NODEMAPSIZE];

unsigned char cpu_to_node[NR_CPUS] __read_mostly = {
	[0 ... NR_CPUS-1] = NUMA_NO_NODE
};
unsigned char apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = {
	[0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
};
cpumask_t node_to_cpumask[MAX_NUMNODES] __read_mostly;

int numa_off __initdata;

/*
 * Given a shift value, try to populate memnodemap[]
 * Returns:
 * 1 if OK
 * 0 if memnodemap[] too small (or shift too small)
 * -1 if node overlap or lost ram (shift too big)
 */
static int __init
populate_memnodemap(const struct node *nodes, int numnodes, int shift)
{
	int i;
	int res = -1;
	unsigned long addr, end;

	if (shift >= 64)
		return -1;
	memset(memnodemap, 0xff, sizeof(memnodemap));
	for (i = 0; i < numnodes; i++) {
		addr = nodes[i].start;
		end = nodes[i].end;
		if (addr >= end)
			continue;
		if ((end >> shift) >= NODEMAPSIZE)
			return 0;
		do {
			if (memnodemap[addr >> shift] != 0xff)
				return -1;
			memnodemap[addr >> shift] = i;
			addr += (1UL << shift);
		} while (addr < end);
		res = 1;
	}
	return res;
}

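/*
 * Find the largest shift (i.e. the coarsest chunk size) at which no
 * two nodes share a memnodemap[] entry.  A larger shift wastes less
 * of the table, but too large a shift makes distinct nodes collide,
 * which populate_memnodemap() reports as -1.
 */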
int __init compute_hash_shift(struct node *nodes, int numnodes)
{
	int shift = 20;

	while (populate_memnodemap(nodes, numnodes, shift + 1) >= 0)
		shift++;

	printk(KERN_DEBUG "NUMA: Using %d for the hash shift.\n",
		shift);

	if (populate_memnodemap(nodes, numnodes, shift) != 1) {
		printk(KERN_INFO
	"Your memory is not aligned; you need to rebuild your kernel "
	"with a bigger NODEMAPSIZE, shift=%d\n",
			shift);
		return -1;
	}
	return shift;
}

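/* Resolve a pfn to its node during early boot, via the memnodemap hash. */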
#ifdef CONFIG_SPARSEMEM
int early_pfn_to_nid(unsigned long pfn)
{
	return phys_to_nid(pfn << PAGE_SHIFT);
}
#endif

/*
 * Initialize the bootmem allocator for a node: place the node's
 * pg_data_t and bootmem bitmap inside the node's own memory, then
 * register the usable ranges and reserve what was just allocated.
 */
void __init setup_node_bootmem(int nodeid, unsigned long start, unsigned long end)
{
	unsigned long start_pfn, end_pfn, bootmap_pages, bootmap_size, bootmap_start;
	unsigned long nodedata_phys;
	const int pgdat_size = round_up(sizeof(pg_data_t), PAGE_SIZE);

	start = round_up(start, ZONE_ALIGN);

	printk(KERN_INFO "Bootmem setup node %d %016lx-%016lx\n", nodeid, start, end);

	start_pfn = start >> PAGE_SHIFT;
	end_pfn = end >> PAGE_SHIFT;

	nodedata_phys = find_e820_area(start, end, pgdat_size);
	if (nodedata_phys == -1L)
		panic("Cannot find memory pgdat in node %d\n", nodeid);

	Dprintk("nodedata_phys %lx\n", nodedata_phys);

	node_data[nodeid] = phys_to_virt(nodedata_phys);
	memset(NODE_DATA(nodeid), 0, sizeof(pg_data_t));
	NODE_DATA(nodeid)->bdata = &plat_node_bdata[nodeid];
	NODE_DATA(nodeid)->node_start_pfn = start_pfn;
	NODE_DATA(nodeid)->node_spanned_pages = end_pfn - start_pfn;

	/* Find a place for the bootmem map */
	bootmap_pages = bootmem_bootmap_pages(end_pfn - start_pfn);
	bootmap_start = round_up(nodedata_phys + pgdat_size, PAGE_SIZE);
	bootmap_start = find_e820_area(bootmap_start, end, bootmap_pages<<PAGE_SHIFT);
	if (bootmap_start == -1L)
		panic("Not enough contiguous space for bootmap on node %d", nodeid);
	Dprintk("bootmap start %lu pages %lu\n", bootmap_start, bootmap_pages);

	bootmap_size = init_bootmem_node(NODE_DATA(nodeid),
					 bootmap_start >> PAGE_SHIFT,
					 start_pfn, end_pfn);

	e820_bootmem_free(NODE_DATA(nodeid), start, end);

	reserve_bootmem_node(NODE_DATA(nodeid), nodedata_phys, pgdat_size);
	reserve_bootmem_node(NODE_DATA(nodeid), bootmap_start, bootmap_pages<<PAGE_SHIFT);
	node_set_online(nodeid);
}

/* Initialize the final allocator zones for a node */
void __init setup_node_zones(int nodeid)
{
	unsigned long start_pfn, end_pfn;
	unsigned long zones[MAX_NR_ZONES];
	unsigned long holes[MAX_NR_ZONES];

	start_pfn = node_start_pfn(nodeid);
	end_pfn = node_end_pfn(nodeid);

	Dprintk(KERN_INFO "Setting up node %d %lx-%lx\n",
		nodeid, start_pfn, end_pfn);

	size_zones(zones, holes, start_pfn, end_pfn);
	free_area_init_node(nodeid, NODE_DATA(nodeid), zones,
			    start_pfn, holes);
}

void __init numa_init_array(void)
{
	int rr, i;
	/*
	 * There are unfortunately some poorly designed mainboards
	 * around that only connect memory to a single CPU. This breaks
	 * the 1:1 cpu->node mapping. To avoid this, fill in the mapping
	 * for all possible CPUs, as the number of CPUs is not known
	 * yet. We round robin the existing nodes.
	 */
	rr = first_node(node_online_map);
	for (i = 0; i < NR_CPUS; i++) {
		if (cpu_to_node[i] != NUMA_NO_NODE)
			continue;
		numa_set_node(i, rr);
		rr = next_node(rr, node_online_map);
		if (rr == MAX_NUMNODES)
			rr = first_node(node_online_map);
	}
}

#ifdef CONFIG_NUMA_EMU
int numa_fake __initdata = 0;

/* NUMA emulation: split memory into numa_fake equally sized nodes */
static int numa_emulation(unsigned long start_pfn, unsigned long end_pfn)
{
	int i;
	struct node nodes[MAX_NUMNODES];
	unsigned long sz = ((end_pfn - start_pfn)<<PAGE_SHIFT) / numa_fake;

	/*
	 * Kludge needed for the hash function: round the node size
	 * down to a power of two so that node boundaries line up with
	 * the shift-based memnodemap lookup.
	 */
	if (hweight64(sz) > 1) {
		unsigned long x = 1;
		while ((x << 1) < sz)
			x <<= 1;
		if (x < sz/2)
			printk(KERN_ERR "Numa emulation unbalanced. Complain to maintainer\n");
		sz = x;
	}

	memset(&nodes, 0, sizeof(nodes));
	for (i = 0; i < numa_fake; i++) {
		nodes[i].start = (start_pfn<<PAGE_SHIFT) + i*sz;
		if (i == numa_fake-1)
			sz = (end_pfn<<PAGE_SHIFT) - nodes[i].start;
		nodes[i].end = nodes[i].start + sz;
		printk(KERN_INFO "Faking node %d at %016Lx-%016Lx (%LuMB)\n",
		       i,
		       nodes[i].start, nodes[i].end,
		       (nodes[i].end - nodes[i].start) >> 20);
		node_set_online(i);
	}
	memnode_shift = compute_hash_shift(nodes, numa_fake);
	if (memnode_shift < 0) {
		memnode_shift = 0;
		printk(KERN_ERR "No NUMA hash function found. Emulation disabled.\n");
		return -1;
	}
	for_each_online_node(i)
		setup_node_bootmem(i, nodes[i].start, nodes[i].end);
	numa_init_array();
	return 0;
}
#endif

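/*
 * Top-level NUMA discovery: try emulation, then ACPI SRAT, then the
 * K8 northbridge registers; if none of them yields a configuration,
 * fall back to a single dummy node covering all of memory.
 */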
void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
{
	int i;

#ifdef CONFIG_NUMA_EMU
	if (numa_fake && !numa_emulation(start_pfn, end_pfn))
		return;
#endif

#ifdef CONFIG_ACPI_NUMA
	if (!numa_off && !acpi_scan_nodes(start_pfn << PAGE_SHIFT,
					  end_pfn << PAGE_SHIFT))
		return;
#endif

#ifdef CONFIG_K8_NUMA
	if (!numa_off && !k8_scan_nodes(start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT))
		return;
#endif
	printk(KERN_INFO "%s\n",
	       numa_off ? "NUMA turned off" : "No NUMA configuration found");

	printk(KERN_INFO "Faking a node at %016lx-%016lx\n",
	       start_pfn << PAGE_SHIFT,
	       end_pfn << PAGE_SHIFT);
	/* setup dummy node covering all memory */
	memnode_shift = 63;
	memnodemap[0] = 0;
	nodes_clear(node_online_map);
	node_set_online(0);
	for (i = 0; i < NR_CPUS; i++)
		numa_set_node(i, 0);
	node_to_cpumask[0] = cpumask_of_cpu(0);
	setup_node_bootmem(0, start_pfn << PAGE_SHIFT, end_pfn << PAGE_SHIFT);
}

__cpuinit void numa_add_cpu(int cpu)
{
	set_bit(cpu, &node_to_cpumask[cpu_to_node(cpu)]);
}

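/* Record the cpu->node mapping in both the per-cpu PDA and cpu_to_node[]. */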
void __cpuinit numa_set_node(int cpu, int node)
{
	cpu_pda(cpu)->nodenumber = node;
	cpu_to_node[cpu] = node;
}

unsigned long __init numa_free_all_bootmem(void)
{
	int i;
	unsigned long pages = 0;
	for_each_online_node(i) {
		pages += free_all_bootmem_node(NODE_DATA(i));
	}
	return pages;
}

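/* Register each node's pfn range with sparsemem, then build the section maps. */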
#ifdef CONFIG_SPARSEMEM
static void __init arch_sparse_init(void)
{
	int i;

	for_each_online_node(i)
		memory_present(i, node_start_pfn(i), node_end_pfn(i));

	sparse_init();
}
#else
#define arch_sparse_init() do {} while (0)
#endif

void __init paging_init(void)
{
	int i;

	arch_sparse_init();

	for_each_online_node(i) {
		setup_node_zones(i);
	}
}

/* Parse the "numa=" boot option: off, fake=<N>, noacpi */
__init int numa_setup(char *opt)
{
	if (!strncmp(opt, "off", 3))
		numa_off = 1;
#ifdef CONFIG_NUMA_EMU
	if (!strncmp(opt, "fake=", 5)) {
		numa_fake = simple_strtoul(opt+5, NULL, 0);
		if (numa_fake >= MAX_NUMNODES)
			numa_fake = MAX_NUMNODES;
	}
#endif
#ifdef CONFIG_ACPI_NUMA
	if (!strncmp(opt, "noacpi", 6))
		acpi_numa = -1;
#endif
	return 1;
}

/*
 * Setup early cpu_to_node.
 *
 * Populate cpu_to_node[] only if the x86_cpu_to_apicid[] and
 * apicid_to_node[] tables have valid entries for a CPU.  This means
 * we skip cpu_to_node[] initialisation for NUMA emulation and the
 * fake-node case (when running a kernel compiled for NUMA on a
 * non-NUMA box), which is OK because cpu_to_node[] was already
 * initialized in a round-robin manner by numa_init_array(), prior to
 * this call, and that initialization is good enough for the fake
 * NUMA cases.
 */
void __init init_cpu_to_node(void)
{
	int i;
	for (i = 0; i < NR_CPUS; i++) {
		u8 apicid = x86_cpu_to_apicid[i];
		if (apicid == BAD_APICID)
			continue;
		if (apicid_to_node[apicid] == NUMA_NO_NODE)
			continue;
		cpu_to_node[i] = apicid_to_node[apicid];
	}
}

EXPORT_SYMBOL(cpu_to_node);
EXPORT_SYMBOL(node_to_cpumask);
EXPORT_SYMBOL(memnode_shift);
EXPORT_SYMBOL(memnodemap);
EXPORT_SYMBOL(node_data);

#ifdef CONFIG_DISCONTIGMEM
/*
 * Functions to convert PFNs from/to per node page addresses.
 * These are out of line because they are quite big.
 * They could all be tuned by pre-caching more state.
 * Should do that.
 */

/* Requires pfn_valid(pfn) to be true */
struct page *pfn_to_page(unsigned long pfn)
{
	int nid = phys_to_nid(((unsigned long)(pfn)) << PAGE_SHIFT);
	return (pfn - node_start_pfn(nid)) + NODE_DATA(nid)->node_mem_map;
}
EXPORT_SYMBOL(pfn_to_page);

unsigned long page_to_pfn(struct page *page)
{
	return (long)(((page) - page_zone(page)->zone_mem_map) +
		      page_zone(page)->zone_start_pfn);
}
EXPORT_SYMBOL(page_to_pfn);

int pfn_valid(unsigned long pfn)
{
	unsigned nid;
	if (pfn >= num_physpages)
		return 0;
	nid = pfn_to_nid(pfn);
	if (nid == 0xff)
		return 0;
	return pfn >= node_start_pfn(nid) && pfn < node_end_pfn(nid);
}
EXPORT_SYMBOL(pfn_valid);
#endif