arch/ia64/kernel/domain.c (git blame: bulk from commit 1da177e4 (LT); later changes in 687f1661, 7f1867a5 (DG), and 367ae3cd)
/*
 * arch/ia64/kernel/domain.c
 * Architecture specific sched-domains builder.
 *
 * Copyright (C) 2004 Jesse Barnes
 * Copyright (C) 2004 Silicon Graphics, Inc.
 */

#include <linux/sched.h>
#include <linux/percpu.h>
#include <linux/slab.h>
#include <linux/cpumask.h>
#include <linux/init.h>
#include <linux/topology.h>
#include <linux/nodemask.h>

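/*
 * Maximum number of nodes covered by one node-level sched_domain; systems
 * with more online CPUs than SD_NODES_PER_DOMAIN nodes' worth get an
 * additional "allnodes" domain level (see build_sched_domains() below).
 */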
#define SD_NODES_PER_DOMAIN 16

#ifdef CONFIG_NUMA
/**
 * find_next_best_node - find the next node to include in a sched_domain
 * @node: node whose sched_domain we're building
 * @used_nodes: nodes already in the sched_domain
 *
 * Find the next node to include in a given scheduling domain.  Simply
 * finds the closest node not already in the @used_nodes map.
 *
 * Should use nodemask_t.
 */
static int find_next_best_node(int node, unsigned long *used_nodes)
{
	int i, n, val, min_val, best_node = 0;

	min_val = INT_MAX;

	for (i = 0; i < MAX_NUMNODES; i++) {
		/* Start at @node */
		n = (node + i) % MAX_NUMNODES;

		if (!nr_cpus_node(n))
			continue;

		/* Skip already used nodes */
		if (test_bit(n, used_nodes))
			continue;

		/* Simple min distance search */
		val = node_distance(node, n);

		if (val < min_val) {
			min_val = val;
			best_node = n;
		}
	}

	set_bit(best_node, used_nodes);
	return best_node;
}

/**
 * sched_domain_node_span - get a cpumask for a node's sched_domain
 * @node: node whose cpumask we're constructing
 *
 * Given a node, construct a good cpumask for its sched_domain to span.  It
 * should be one that prevents unnecessary balancing, but also spreads tasks
 * out optimally.
 */
static cpumask_t sched_domain_node_span(int node)
{
	int i;
	cpumask_t span, nodemask;
	DECLARE_BITMAP(used_nodes, MAX_NUMNODES);

	cpus_clear(span);
	bitmap_zero(used_nodes, MAX_NUMNODES);

	nodemask = node_to_cpumask(node);
	cpus_or(span, span, nodemask);
	set_bit(node, used_nodes);

	for (i = 1; i < SD_NODES_PER_DOMAIN; i++) {
		int next_node = find_next_best_node(node, used_nodes);
		nodemask = node_to_cpumask(next_node);
		cpus_or(span, span, nodemask);
	}

	return span;
}
#endif

/*
 * At the moment, CONFIG_SCHED_SMT is never defined, but leave it in so we
 * can switch it on easily if needed.
 */
#ifdef CONFIG_SCHED_SMT
static DEFINE_PER_CPU(struct sched_domain, cpu_domains);
static struct sched_group sched_group_cpus[NR_CPUS];
static int cpu_to_cpu_group(int cpu)
{
	return cpu;
}
#endif

static DEFINE_PER_CPU(struct sched_domain, phys_domains);
static struct sched_group sched_group_phys[NR_CPUS];
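/*
 * With SMT enabled, all hardware siblings share one physical-level group,
 * keyed by the first CPU in their sibling map; otherwise each CPU is its
 * own group.
 */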
static int cpu_to_phys_group(int cpu)
{
#ifdef CONFIG_SCHED_SMT
	return first_cpu(cpu_sibling_map[cpu]);
#else
	return cpu;
#endif
}

#ifdef CONFIG_NUMA
/*
 * The init_sched_build_groups can't handle what we want to do with node
 * groups, so roll our own. Now each node has its own list of groups which
 * gets dynamically allocated.
 */
static DEFINE_PER_CPU(struct sched_domain, node_domains);
static struct sched_group *sched_group_nodes[MAX_NUMNODES];

static DEFINE_PER_CPU(struct sched_domain, allnodes_domains);
static struct sched_group sched_group_allnodes[MAX_NUMNODES];

static int cpu_to_allnodes_group(int cpu)
{
	return cpu_to_node(cpu);
}
#endif

/*
 * Build sched domains for a given set of cpus and attach the sched domains
 * to the individual cpus
 */
void build_sched_domains(const cpumask_t *cpu_map)
{
	int i;

	/*
	 * Set up domains for cpus specified by the cpu_map.
	 */
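	/*
	 * Domains are stacked child to parent: the SMT sibling domain (when
	 * CONFIG_SCHED_SMT is set), then the node-local physical domain,
	 * then the NUMA node domain, and on large systems an "allnodes"
	 * domain spanning the whole cpu_map.
	 */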
	for_each_cpu_mask(i, *cpu_map) {
		int group;
		struct sched_domain *sd = NULL, *p;
		cpumask_t nodemask = node_to_cpumask(cpu_to_node(i));

		cpus_and(nodemask, nodemask, *cpu_map);

#ifdef CONFIG_NUMA
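		/*
		 * Large systems (more online CPUs than SD_NODES_PER_DOMAIN
		 * nodes' worth) get an extra top-level domain covering the
		 * entire cpu_map.
		 */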
		if (num_online_cpus()
				> SD_NODES_PER_DOMAIN*cpus_weight(nodemask)) {
			sd = &per_cpu(allnodes_domains, i);
			*sd = SD_ALLNODES_INIT;
			sd->span = *cpu_map;
			group = cpu_to_allnodes_group(i);
			sd->groups = &sched_group_allnodes[group];
			p = sd;
		} else
			p = NULL;

		sd = &per_cpu(node_domains, i);
		*sd = SD_NODE_INIT;
		sd->span = sched_domain_node_span(cpu_to_node(i));
		sd->parent = p;
		cpus_and(sd->span, sd->span, *cpu_map);
#endif

		p = sd;
		sd = &per_cpu(phys_domains, i);
		group = cpu_to_phys_group(i);
		*sd = SD_CPU_INIT;
		sd->span = nodemask;
		sd->parent = p;
		sd->groups = &sched_group_phys[group];

#ifdef CONFIG_SCHED_SMT
		p = sd;
		sd = &per_cpu(cpu_domains, i);
		group = cpu_to_cpu_group(i);
		*sd = SD_SIBLING_INIT;
		sd->span = cpu_sibling_map[i];
		cpus_and(sd->span, sd->span, *cpu_map);
		sd->parent = p;
		sd->groups = &sched_group_cpus[group];
#endif
	}

#ifdef CONFIG_SCHED_SMT
	/* Set up CPU (sibling) groups */
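	/* Each sibling set is handled once, by its first CPU in cpu_map. */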
	for_each_cpu_mask(i, *cpu_map) {
		cpumask_t this_sibling_map = cpu_sibling_map[i];
		cpus_and(this_sibling_map, this_sibling_map, *cpu_map);
		if (i != first_cpu(this_sibling_map))
			continue;

		init_sched_build_groups(sched_group_cpus, this_sibling_map,
						&cpu_to_cpu_group);
	}
#endif

	/* Set up physical groups */
	for (i = 0; i < MAX_NUMNODES; i++) {
		cpumask_t nodemask = node_to_cpumask(i);

		cpus_and(nodemask, nodemask, *cpu_map);
		if (cpus_empty(nodemask))
			continue;

		init_sched_build_groups(sched_group_phys, nodemask,
						&cpu_to_phys_group);
	}

#ifdef CONFIG_NUMA
	init_sched_build_groups(sched_group_allnodes, *cpu_map,
					&cpu_to_allnodes_group);

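	/*
	 * For each node, build a circular list of sched_groups: the first
	 * group holds the node's own CPUs, followed by one group for every
	 * other node inside this node's domain span, closed back onto the
	 * first group.
	 */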
	for (i = 0; i < MAX_NUMNODES; i++) {
		/* Set up node groups */
		struct sched_group *sg, *prev;
		cpumask_t nodemask = node_to_cpumask(i);
		cpumask_t domainspan;
		cpumask_t covered = CPU_MASK_NONE;
		int j;

		cpus_and(nodemask, nodemask, *cpu_map);
		if (cpus_empty(nodemask))
			continue;

		domainspan = sched_domain_node_span(i);
		cpus_and(domainspan, domainspan, *cpu_map);

		sg = kmalloc(sizeof(struct sched_group), GFP_KERNEL);
		sched_group_nodes[i] = sg;
		for_each_cpu_mask(j, nodemask) {
			struct sched_domain *sd;
			sd = &per_cpu(node_domains, j);
			sd->groups = sg;
			if (sd->groups == NULL) {
				/* Turn off balancing if we have no groups */
				sd->flags = 0;
			}
		}
		if (!sg) {
			printk(KERN_WARNING
			"Can not alloc domain group for node %d\n", i);
			continue;
		}
		sg->cpu_power = 0;
		sg->cpumask = nodemask;
		cpus_or(covered, covered, nodemask);
		prev = sg;

		for (j = 0; j < MAX_NUMNODES; j++) {
			cpumask_t tmp, notcovered;
			int n = (i + j) % MAX_NUMNODES;

			cpus_complement(notcovered, covered);
			cpus_and(tmp, notcovered, *cpu_map);
			cpus_and(tmp, tmp, domainspan);
			if (cpus_empty(tmp))
				break;

			nodemask = node_to_cpumask(n);
			cpus_and(tmp, tmp, nodemask);
			if (cpus_empty(tmp))
				continue;

			sg = kmalloc(sizeof(struct sched_group), GFP_KERNEL);
			if (!sg) {
				printk(KERN_WARNING
				"Can not alloc domain group for node %d\n", j);
				break;
			}
			sg->cpu_power = 0;
			sg->cpumask = tmp;
			cpus_or(covered, covered, tmp);
			prev->next = sg;
			prev = sg;
		}
		prev->next = sched_group_nodes[i];
	}
#endif

	/* Calculate CPU power for physical packages and nodes */
	for_each_cpu_mask(i, *cpu_map) {
		int power;
		struct sched_domain *sd;
#ifdef CONFIG_SCHED_SMT
		sd = &per_cpu(cpu_domains, i);
		power = SCHED_LOAD_SCALE;
		sd->groups->cpu_power = power;
#endif

		sd = &per_cpu(phys_domains, i);
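		/*
		 * One full CPU of capacity plus 10% of SCHED_LOAD_SCALE for
		 * each additional CPU in the group.
		 */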
		power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE *
				(cpus_weight(sd->groups->cpumask)-1) / 10;
		sd->groups->cpu_power = power;

#ifdef CONFIG_NUMA
		sd = &per_cpu(allnodes_domains, i);
		if (sd->groups) {
			power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE *
				(cpus_weight(sd->groups->cpumask)-1) / 10;
			sd->groups->cpu_power = power;
		}
#endif
	}

#ifdef CONFIG_NUMA
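	/*
	 * A node-level group's cpu_power is the sum of the power of every
	 * physical package it contains, each package counted once.
	 */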
	for (i = 0; i < MAX_NUMNODES; i++) {
		struct sched_group *sg = sched_group_nodes[i];
		int j;

		if (sg == NULL)
			continue;
next_sg:
		for_each_cpu_mask(j, sg->cpumask) {
			struct sched_domain *sd;
			int power;

			sd = &per_cpu(phys_domains, j);
			if (j != first_cpu(sd->groups->cpumask)) {
				/*
				 * Only add "power" once for each
				 * physical package.
				 */
				continue;
			}
			power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE *
				(cpus_weight(sd->groups->cpumask)-1) / 10;

			sg->cpu_power += power;
		}
		sg = sg->next;
		if (sg != sched_group_nodes[i])
			goto next_sg;
	}
#endif

	/* Attach the domains */
	for_each_cpu_mask(i, *cpu_map) {
		struct sched_domain *sd;
#ifdef CONFIG_SCHED_SMT
		sd = &per_cpu(cpu_domains, i);
#else
		sd = &per_cpu(phys_domains, i);
#endif
		cpu_attach_domain(sd, i);
	}
}

/*
 * Set up scheduler domains and groups.  Callers must hold the hotplug lock.
 */
void arch_init_sched_domains(const cpumask_t *cpu_map)
{
	cpumask_t cpu_default_map;

	/*
	 * Setup mask for cpus without special case scheduling requirements.
	 * For now this just excludes isolated cpus, but could be used to
	 * exclude other special cases in the future.
	 */
	cpus_andnot(cpu_default_map, *cpu_map, cpu_isolated_map);

	build_sched_domains(&cpu_default_map);
}

void arch_destroy_sched_domains(const cpumask_t *cpu_map)
{
#ifdef CONFIG_NUMA
	int i;
	for (i = 0; i < MAX_NUMNODES; i++) {
		cpumask_t nodemask = node_to_cpumask(i);
		struct sched_group *oldsg, *sg = sched_group_nodes[i];

		cpus_and(nodemask, nodemask, *cpu_map);
		if (cpus_empty(nodemask))
			continue;

		if (sg == NULL)
			continue;
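		/*
		 * Free the whole circular list: start with the group after
		 * the head and stop once the head itself has been freed.
		 */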
		sg = sg->next;
next_sg:
		oldsg = sg;
		sg = sg->next;
		kfree(oldsg);
		if (oldsg != sched_group_nodes[i])
			goto next_sg;
		sched_group_nodes[i] = NULL;
	}
#endif
}