/*
 * NUMA support for s390
 *
 * NUMA emulation (aka fake NUMA) distributes the available memory to nodes
 * without using real topology information about the physical memory of the
 * machine.
 *
 * It distributes the available CPUs to nodes while respecting the original
 * machine topology information. This is done by trying to avoid separating
 * CPUs which reside on the same book or even on the same MC.
 *
 * Because the current Linux scheduler code requires a stable cpu to node
 * mapping, cores are pinned to nodes when the first CPU thread is set online.
 *
 * Copyright IBM Corp. 2015
 */

#define KMSG_COMPONENT "numa_emu"
#define pr_fmt(fmt)	KMSG_COMPONENT ": " fmt

#include <linux/kernel.h>
#include <linux/cpumask.h>
#include <linux/memblock.h>
#include <linux/node.h>
#include <linux/memory.h>
#include <linux/slab.h>
#include <asm/smp.h>
#include <asm/topology.h>
#include "numa_mode.h"
#include "toptree.h"

/* Distances between the different system components */
#define DIST_EMPTY	0
#define DIST_CORE	1
#define DIST_MC		2
#define DIST_BOOK	3
#define DIST_MAX	4
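
/*
 * The values above are ordered by increasing distance and are used as the
 * proximity metric by dist_core_to_core() and dist_node_to_core() below.
 * DIST_EMPTY marks a node that has no cores yet; being the smallest value,
 * it makes empty nodes fill up first.
 */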

/* Node distance reported to common code */
#define EMU_NODE_DIST	10

/* Node ID for free (not yet pinned) cores */
#define NODE_ID_FREE	-1

/* Different levels of toptree */
enum toptree_level {CORE, MC, BOOK, NODE, TOPOLOGY};

/* The two toptree IDs */
enum {TOPTREE_ID_PHYS, TOPTREE_ID_NUMA};

/* Number of NUMA nodes */
static int emu_nodes = 1;
/* NUMA stripe size */
static unsigned long emu_size;

/*
 * Node to core pinning information updates are protected by
 * "sched_domains_mutex".
 */
static struct {
	s32 to_node_id[CONFIG_NR_CPUS];	/* Pinned core to node mapping */
	int total;			/* Total number of pinned cores */
	int per_node_target;		/* Cores per node without extra cores */
	int per_node[MAX_NUMNODES];	/* Number of cores pinned to node */
} *emu_cores;
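
/*
 * Illustrative example: with 10 cores and emu_nodes = 3 the per-node
 * target is 10 / 3 = 3; the one remaining core is placed by the second
 * "extra" pass in toptree_to_numa().
 */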

/*
 * Pin a core to a node
 */
static void pin_core_to_node(int core_id, int node_id)
{
	if (emu_cores->to_node_id[core_id] == NODE_ID_FREE) {
		emu_cores->per_node[node_id]++;
		emu_cores->to_node_id[core_id] = node_id;
		emu_cores->total++;
	} else {
		WARN_ON(emu_cores->to_node_id[core_id] != node_id);
	}
}

/*
 * Number of pinned cores of a node
 */
static int cores_pinned(struct toptree *node)
{
	return emu_cores->per_node[node->id];
}

/*
 * ID of the node where the core is pinned (or NODE_ID_FREE)
 */
static int core_pinned_to_node_id(struct toptree *core)
{
	return emu_cores->to_node_id[core->id];
}

/*
 * Number of cores in the tree that are not yet pinned
 */
static int cores_free(struct toptree *tree)
{
	struct toptree *core;
	int count = 0;

	toptree_for_each(core, tree, CORE) {
		if (core_pinned_to_node_id(core) == NODE_ID_FREE)
			count++;
	}
	return count;
}

/*
 * Return node of core (the toptree hierarchy is fixed:
 * CORE -> MC -> BOOK -> NODE, hence three parent hops)
 */
static struct toptree *core_node(struct toptree *core)
{
	return core->parent->parent->parent;
}

/*
 * Return book of core
 */
static struct toptree *core_book(struct toptree *core)
{
	return core->parent->parent;
}

/*
 * Return mc of core
 */
static struct toptree *core_mc(struct toptree *core)
{
	return core->parent;
}

/*
 * Distance between two cores
 */
static int dist_core_to_core(struct toptree *core1, struct toptree *core2)
{
	if (core_book(core1)->id != core_book(core2)->id)
		return DIST_BOOK;
	if (core_mc(core1)->id != core_mc(core2)->id)
		return DIST_MC;
	/* Same core or sibling on same MC */
	return DIST_CORE;
}
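
/*
 * Illustrative example: two cores on different books are DIST_BOOK (3)
 * apart; on the same book but different MCs, DIST_MC (2); on the same
 * MC, DIST_CORE (1).
 */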

/*
 * Distance of a node to a core
 */
static int dist_node_to_core(struct toptree *node, struct toptree *core)
{
	struct toptree *core_node;
	int dist_min = DIST_MAX;

	toptree_for_each(core_node, node, CORE)
		dist_min = min(dist_min, dist_core_to_core(core_node, core));
	return dist_min == DIST_MAX ? DIST_EMPTY : dist_min;
}

/*
 * toptree_unify() deletes empty nodes; recreate them afterwards so that
 * every NUMA node stays represented in the tree.
 */
static void toptree_unify_tree(struct toptree *tree)
{
	int nid;

	toptree_unify(tree);
	for (nid = 0; nid < emu_nodes; nid++)
		toptree_get_child(tree, nid);
}

/*
 * Find the best/nearest node for a given core and ensure that no node
 * gets more than "emu_cores->per_node_target + extra" cores.
 */
static struct toptree *node_for_core(struct toptree *numa, struct toptree *core,
				     int extra)
{
	struct toptree *node, *node_best = NULL;
	int dist_cur, dist_best, cores_target;

	cores_target = emu_cores->per_node_target + extra;
	dist_best = DIST_MAX;
	node_best = NULL;
	toptree_for_each(node, numa, NODE) {
		/* Already pinned cores must use their nodes */
		if (core_pinned_to_node_id(core) == node->id) {
			node_best = node;
			break;
		}
		/* Skip nodes that already have enough cores */
		if (cores_pinned(node) >= cores_target)
			continue;
		dist_cur = dist_node_to_core(node, core);
		if (dist_cur < dist_best) {
			dist_best = dist_cur;
			node_best = node;
		}
	}
	return node_best;
}

/*
 * Find the best node for each core with respect to "extra" core count
 */
static void toptree_to_numa_single(struct toptree *numa, struct toptree *phys,
				   int extra)
{
	struct toptree *node, *core, *tmp;

	toptree_for_each_safe(core, tmp, phys, CORE) {
		node = node_for_core(numa, core, extra);
		if (!node)
			return;
		toptree_move(core, node);
		pin_core_to_node(core->id, node->id);
	}
}

/*
 * Move structures of given level to specified NUMA node
 */
static void move_level_to_numa_node(struct toptree *node, struct toptree *phys,
				    enum toptree_level level, bool perfect)
{
	int cores_free, cores_target = emu_cores->per_node_target;
	struct toptree *cur, *tmp;

	toptree_for_each_safe(cur, tmp, phys, level) {
		cores_free = cores_target - toptree_count(node, CORE);
		if (perfect) {
			if (cores_free == toptree_count(cur, CORE))
				toptree_move(cur, node);
		} else {
			if (cores_free >= toptree_count(cur, CORE))
				toptree_move(cur, node);
		}
	}
}

/*
 * Move structures of a given level to NUMA nodes. If "perfect" is specified,
 * move only perfectly fitting structures. Otherwise also move structures
 * that are smaller than needed.
 */
static void move_level_to_numa(struct toptree *numa, struct toptree *phys,
			       enum toptree_level level, bool perfect)
{
	struct toptree *node;

	toptree_for_each(node, numa, NODE)
		move_level_to_numa_node(node, phys, level, perfect);
}

/*
 * For the first run try to move the big structures
 */
static void toptree_to_numa_first(struct toptree *numa, struct toptree *phys)
{
	struct toptree *core;

	/* Always try to move perfectly fitting structures first */
	move_level_to_numa(numa, phys, BOOK, true);
	move_level_to_numa(numa, phys, BOOK, false);
	move_level_to_numa(numa, phys, MC, true);
	move_level_to_numa(numa, phys, MC, false);
	/* Now pin all the moved cores */
	toptree_for_each(core, numa, CORE)
		pin_core_to_node(core->id, core_node(core)->id);
}

/*
 * Allocate new topology and create required nodes
 */
static struct toptree *toptree_new(int id, int nodes)
{
	struct toptree *tree;
	int nid;

	tree = toptree_alloc(TOPOLOGY, id);
	if (!tree)
		goto fail;
	for (nid = 0; nid < nodes; nid++) {
		if (!toptree_get_child(tree, nid))
			goto fail;
	}
	return tree;
fail:
	panic("NUMA emulation could not allocate topology");
}

/*
 * Allocate and initialize core to node mapping
 */
static void create_core_to_node_map(void)
{
	int i;

	emu_cores = kzalloc(sizeof(*emu_cores), GFP_KERNEL);
	if (emu_cores == NULL)
		panic("Could not allocate cores to node memory");
	for (i = 0; i < ARRAY_SIZE(emu_cores->to_node_id); i++)
		emu_cores->to_node_id[i] = NODE_ID_FREE;
}

/*
 * Move cores from physical topology into NUMA target topology
 * and try to keep as much of the physical topology as possible.
 */
static struct toptree *toptree_to_numa(struct toptree *phys)
{
	static int first = 1;
	struct toptree *numa;
	int cores_total;

	cores_total = emu_cores->total + cores_free(phys);
	emu_cores->per_node_target = cores_total / emu_nodes;
	numa = toptree_new(TOPTREE_ID_NUMA, emu_nodes);
	if (first) {
		toptree_to_numa_first(numa, phys);
		first = 0;
	}
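	/*
	 * Two passes over the remaining free cores: the first ("extra" = 0)
	 * fills each node up to the per-node target, the second ("extra" = 1)
	 * distributes the cores left over when the core count is not evenly
	 * divisible by the node count.
	 */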
	toptree_to_numa_single(numa, phys, 0);
	toptree_to_numa_single(numa, phys, 1);
	toptree_unify_tree(numa);

	WARN_ON(cpumask_weight(&phys->mask));
	return numa;
}

/*
 * Create a toptree out of the physical topology that we got from the
 * hypervisor
 */
static struct toptree *toptree_from_topology(void)
{
	struct toptree *phys, *node, *book, *mc, *core;
	struct cpu_topology_s390 *top;
	int cpu;

	phys = toptree_new(TOPTREE_ID_PHYS, 1);

	for_each_online_cpu(cpu) {
		top = &per_cpu(cpu_topology, cpu);
		node = toptree_get_child(phys, 0);
		book = toptree_get_child(node, top->book_id);
		mc = toptree_get_child(book, top->socket_id);
		core = toptree_get_child(mc, top->core_id);
		if (!book || !mc || !core)
			panic("NUMA emulation could not allocate memory");
		cpumask_set_cpu(cpu, &core->mask);
		toptree_update_mask(mc);
	}
	return phys;
}

/*
 * Add toptree core to topology and create correct CPU masks
 */
static void topology_add_core(struct toptree *core)
{
	struct cpu_topology_s390 *top;
	int cpu;

	for_each_cpu(cpu, &core->mask) {
		top = &per_cpu(cpu_topology, cpu);
		cpumask_copy(&top->thread_mask, &core->mask);
		cpumask_copy(&top->core_mask, &core_mc(core)->mask);
		cpumask_copy(&top->book_mask, &core_book(core)->mask);
		cpumask_set_cpu(cpu, &node_to_cpumask_map[core_node(core)->id]);
		top->node_id = core_node(core)->id;
	}
}

/*
 * Apply toptree to topology and create CPU masks
 */
static void toptree_to_topology(struct toptree *numa)
{
	struct toptree *core;
	int i;

	/* Clear all node masks */
	for (i = 0; i < MAX_NUMNODES; i++)
		cpumask_clear(&node_to_cpumask_map[i]);

	/* Rebuild all masks */
	toptree_for_each(core, numa, CORE)
		topology_add_core(core);
}

/*
 * Show the node to core mapping
 */
static void print_node_to_core_map(void)
{
	int nid, cid;

	if (!numa_debug_enabled)
		return;
	printk(KERN_DEBUG "NUMA node to core mapping\n");
	for (nid = 0; nid < emu_nodes; nid++) {
		printk(KERN_DEBUG " node %3d: ", nid);
		for (cid = 0; cid < ARRAY_SIZE(emu_cores->to_node_id); cid++) {
			if (emu_cores->to_node_id[cid] == nid)
				printk(KERN_CONT "%d ", cid);
		}
		printk(KERN_CONT "\n");
	}
}

/*
 * Transfer physical topology into a NUMA topology and modify CPU masks
 * according to the NUMA topology.
 *
 * Must be called with "sched_domains_mutex" lock held.
 */
static void emu_update_cpu_topology(void)
{
	struct toptree *phys, *numa;

	if (emu_cores == NULL)
		create_core_to_node_map();
	phys = toptree_from_topology();
	numa = toptree_to_numa(phys);
	toptree_free(phys);
	toptree_to_topology(numa);
	toptree_free(numa);
	print_node_to_core_map();
}

/*
 * If emu_size is not set, use CONFIG_EMU_SIZE. Then round it up to the
 * memory block size, which is the minimum alignment needed for memory
 * hotplug.
 */
static unsigned long emu_setup_size_adjust(unsigned long size)
{
	unsigned long size_new;

	size = size ? : CONFIG_EMU_SIZE;
	size_new = roundup(size, memory_block_size_bytes());
	if (size_new == size)
		return size;
	pr_warn("Increasing memory stripe size from %ld MB to %ld MB\n",
		size >> 20, size_new >> 20);
	return size_new;
}
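
/*
 * Illustrative example: assuming a 256 MB memory block size, a requested
 * stripe size of 600 MB would be rounded up to 768 MB, with a warning.
 */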

/*
 * If there is not enough memory for the specified number of nodes, reduce
 * the node count.
 */
static int emu_setup_nodes_adjust(int nodes)
{
	int nodes_max;

	nodes_max = memblock.memory.total_size / emu_size;
	nodes_max = max(nodes_max, 1);
	if (nodes_max >= nodes)
		return nodes;
	pr_warn("Not enough memory for %d nodes, reducing node count\n", nodes);
	return nodes_max;
}
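
/*
 * Illustrative example: with 3 GB of memory and a 1 GB stripe size, at
 * most three nodes fit, so a request for emu_nodes=8 is reduced to 3.
 */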

/*
 * Early emu setup
 */
static void emu_setup(void)
{
	emu_size = emu_setup_size_adjust(emu_size);
	emu_nodes = emu_setup_nodes_adjust(emu_nodes);
	pr_info("Creating %d nodes with memory stripe size %ld MB\n",
		emu_nodes, emu_size >> 20);
}

/*
 * Return node id for given page number
 */
static int emu_pfn_to_nid(unsigned long pfn)
{
	return (pfn / (emu_size >> PAGE_SHIFT)) % emu_nodes;
}
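
/*
 * Illustrative example: with emu_size = 512 MB and emu_nodes = 4, pages
 * below 512 MB map to node 0, the next 512 MB stripe to node 1, then to
 * nodes 2 and 3, after which the mapping wraps around to node 0.
 */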

/*
 * Return stripe size
 */
static unsigned long emu_align(void)
{
	return emu_size;
}

/*
 * Return distance between two nodes: 0 for the same node, EMU_NODE_DIST
 * otherwise
 */
static int emu_distance(int node1, int node2)
{
	return (node1 != node2) * EMU_NODE_DIST;
}

/*
 * Define callbacks for generic s390 NUMA infrastructure
 */
const struct numa_mode numa_mode_emu = {
	.name = "emu",
	.setup = emu_setup,
	.update_cpu_topology = emu_update_cpu_topology,
	.__pfn_to_nid = emu_pfn_to_nid,
	.align = emu_align,
	.distance = emu_distance,
};

/*
 * Kernel parameter: emu_nodes=<n>
 */
static int __init early_parse_emu_nodes(char *p)
{
	int count;

	if (kstrtoint(p, 0, &count) != 0 || count <= 0)
		return 0;
	emu_nodes = min(count, MAX_NUMNODES);
	return 0;
}
early_param("emu_nodes", early_parse_emu_nodes);

/*
 * Kernel parameter: emu_size=[<n>[k|M|G|T]]
 */
static int __init early_parse_emu_size(char *p)
{
	emu_size = memparse(p, NULL);
	return 0;
}
early_param("emu_size", early_parse_emu_size);