Commit | Line | Data |
---|---|---|
d7e09d03 PT |
1 | /* |
2 | * GPL HEADER START | |
3 | * | |
4 | * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. | |
5 | * | |
6 | * This program is free software; you can redistribute it and/or modify | |
7 | * it under the terms of the GNU General Public License version 2 only, | |
8 | * as published by the Free Software Foundation. | |
9 | * | |
10 | * This program is distributed in the hope that it will be useful, but | |
11 | * WITHOUT ANY WARRANTY; without even the implied warranty of | |
12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
13 | * General Public License version 2 for more details (a copy is included | |
14 | * in the LICENSE file that accompanied this code). | |
15 | * | |
16 | * You should have received a copy of the GNU General Public License | |
17 | * version 2 along with this program; if not, write to the | |
18 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | |
19 | * Boston, MA 021110-1307, USA | |
20 | * | |
21 | * GPL HEADER END | |
22 | */ | |
23 | /* | |
24 | * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. | |
25 | * Copyright (c) 2012, Intel Corporation. | |
26 | */ | |
27 | /* | |
28 | * This file is part of Lustre, http://www.lustre.org/ | |
29 | * Lustre is a trademark of Sun Microsystems, Inc. | |
30 | * | |
31 | * Author: liang@whamcloud.com | |
32 | */ | |
33 | ||
34 | #define DEBUG_SUBSYSTEM S_LNET | |
35 | ||
36 | #include <linux/cpu.h> | |
37 | #include <linux/sched.h> | |
9fdaf8c0 | 38 | #include "../../../include/linux/libcfs/libcfs.h" |
d7e09d03 PT |
39 | |
40 | #ifdef CONFIG_SMP | |
41 | ||
42 | /** | |
43 | * Module parameter (read-only, 0444): number of CPU partitions to create. | |
44 | * Ignored when cpu_pattern is set. | |
45 | * 0 : estimate best value based on cores or NUMA nodes | |
46 | * 1 : disable multiple partitions | |
47 | * >1 : specify number of partitions | |
48 | */ | |
49 | static int cpu_npartitions; | |
8cc7b4b9 PT |
50 | module_param(cpu_npartitions, int, 0444); |
51 | MODULE_PARM_DESC(cpu_npartitions, "# of CPU partitions"); | |
d7e09d03 PT |
52 | |
53 | /** | |
54 | * Module parameter (read-only, 0444) describing the CPU partition layout: | |
55 | * | |
56 | * e.g. "0[0,1,2,3] 1[4,5,6,7]", number before bracket is CPU partition ID, | |
57 | * number in bracket is processor ID (core or HT) | |
58 | * | |
59 | * e.g. "N 0[0,1] 1[2,3]", a leading 'N' (or 'n') means numbers in bracket | |
60 | * are NUMA node ID, number before bracket is CPU partition ID. | |
61 | * | |
62 | * NB: If user specified cpu_pattern, cpu_npartitions will be ignored | |
63 | */ | |
64 | static char *cpu_pattern = ""; | |
8cc7b4b9 PT |
65 | module_param(cpu_pattern, charp, 0444); |
66 | MODULE_PARM_DESC(cpu_pattern, "CPU partitions pattern"); | |
d7e09d03 PT |
67 | |
68 | struct cfs_cpt_data { | |
69 | /* protects cpt_version against the CPU hotplug notifier */ | |
70 | spinlock_t cpt_lock; | |
71 | /* bumped on CPU online/dead events; copied into ctb_version of each new table */ | |
72 | unsigned long cpt_version; | |
73 | /* mutex to protect cpt_cpumask */ | |
6246dab1 | 74 | struct mutex cpt_mutex; |
d7e09d03 PT |
75 | /* scratch cpumask used by cfs_cpt_set/unset_node and the hotplug notifier */ |
76 | cpumask_t *cpt_cpumask; | |
77 | }; | |
78 | ||
79 | static struct cfs_cpt_data cpt_data; | |
80 | ||
3867ea5a | 81 | static void cfs_cpu_core_siblings(int cpu, cpumask_t *mask) |
d7e09d03 PT |
82 | { |
83 | /* return cpumask of cores in the same socket */ | |
84 | cpumask_copy(mask, topology_core_cpumask(cpu)); | |
85 | } | |
d7e09d03 PT |
86 | |
87 | /* return cpumask of HTs in the same core */ | |
3867ea5a | 88 | static void cfs_cpu_ht_siblings(int cpu, cpumask_t *mask) |
d7e09d03 PT |
89 | { |
90 | cpumask_copy(mask, topology_thread_cpumask(cpu)); | |
91 | } | |
d7e09d03 | 92 | |
3867ea5a | 93 | static void cfs_node_to_cpumask(int node, cpumask_t *mask) |
d7e09d03 PT |
94 | { |
95 | cpumask_copy(mask, cpumask_of_node(node)); | |
96 | } | |
d7e09d03 PT |
97 | |
98 | void | |
99 | cfs_cpt_table_free(struct cfs_cpt_table *cptab) | |
100 | { | |
101 | int i; | |
102 | ||
103 | if (cptab->ctb_cpu2cpt != NULL) { | |
104 | LIBCFS_FREE(cptab->ctb_cpu2cpt, | |
105 | num_possible_cpus() * | |
106 | sizeof(cptab->ctb_cpu2cpt[0])); | |
107 | } | |
108 | ||
109 | for (i = 0; cptab->ctb_parts != NULL && i < cptab->ctb_nparts; i++) { | |
110 | struct cfs_cpu_partition *part = &cptab->ctb_parts[i]; | |
111 | ||
112 | if (part->cpt_nodemask != NULL) { | |
113 | LIBCFS_FREE(part->cpt_nodemask, | |
114 | sizeof(*part->cpt_nodemask)); | |
115 | } | |
116 | ||
117 | if (part->cpt_cpumask != NULL) | |
118 | LIBCFS_FREE(part->cpt_cpumask, cpumask_size()); | |
119 | } | |
120 | ||
121 | if (cptab->ctb_parts != NULL) { | |
122 | LIBCFS_FREE(cptab->ctb_parts, | |
123 | cptab->ctb_nparts * sizeof(cptab->ctb_parts[0])); | |
124 | } | |
125 | ||
126 | if (cptab->ctb_nodemask != NULL) | |
127 | LIBCFS_FREE(cptab->ctb_nodemask, sizeof(*cptab->ctb_nodemask)); | |
128 | if (cptab->ctb_cpumask != NULL) | |
129 | LIBCFS_FREE(cptab->ctb_cpumask, cpumask_size()); | |
130 | ||
131 | LIBCFS_FREE(cptab, sizeof(*cptab)); | |
132 | } | |
133 | EXPORT_SYMBOL(cfs_cpt_table_free); | |
134 | ||
135 | struct cfs_cpt_table * | |
136 | cfs_cpt_table_alloc(unsigned int ncpt) | |
137 | { | |
138 | struct cfs_cpt_table *cptab; | |
139 | int i; | |
140 | ||
141 | LIBCFS_ALLOC(cptab, sizeof(*cptab)); | |
142 | if (cptab == NULL) | |
143 | return NULL; | |
144 | ||
145 | cptab->ctb_nparts = ncpt; | |
146 | ||
147 | LIBCFS_ALLOC(cptab->ctb_cpumask, cpumask_size()); | |
148 | LIBCFS_ALLOC(cptab->ctb_nodemask, sizeof(*cptab->ctb_nodemask)); | |
149 | ||
150 | if (cptab->ctb_cpumask == NULL || cptab->ctb_nodemask == NULL) | |
151 | goto failed; | |
152 | ||
153 | LIBCFS_ALLOC(cptab->ctb_cpu2cpt, | |
154 | num_possible_cpus() * sizeof(cptab->ctb_cpu2cpt[0])); | |
155 | if (cptab->ctb_cpu2cpt == NULL) | |
156 | goto failed; | |
157 | ||
158 | memset(cptab->ctb_cpu2cpt, -1, | |
159 | num_possible_cpus() * sizeof(cptab->ctb_cpu2cpt[0])); | |
160 | ||
161 | LIBCFS_ALLOC(cptab->ctb_parts, ncpt * sizeof(cptab->ctb_parts[0])); | |
162 | if (cptab->ctb_parts == NULL) | |
163 | goto failed; | |
164 | ||
165 | for (i = 0; i < ncpt; i++) { | |
166 | struct cfs_cpu_partition *part = &cptab->ctb_parts[i]; | |
167 | ||
168 | LIBCFS_ALLOC(part->cpt_cpumask, cpumask_size()); | |
169 | LIBCFS_ALLOC(part->cpt_nodemask, sizeof(*part->cpt_nodemask)); | |
170 | if (part->cpt_cpumask == NULL || part->cpt_nodemask == NULL) | |
171 | goto failed; | |
172 | } | |
173 | ||
174 | spin_lock(&cpt_data.cpt_lock); | |
175 | /* Reserved for hotplug */ | |
176 | cptab->ctb_version = cpt_data.cpt_version; | |
177 | spin_unlock(&cpt_data.cpt_lock); | |
178 | ||
179 | return cptab; | |
180 | ||
181 | failed: | |
182 | cfs_cpt_table_free(cptab); | |
183 | return NULL; | |
184 | } | |
185 | EXPORT_SYMBOL(cfs_cpt_table_alloc); | |
186 | ||
187 | int | |
188 | cfs_cpt_table_print(struct cfs_cpt_table *cptab, char *buf, int len) | |
189 | { | |
190 | char *tmp = buf; | |
191 | int rc = 0; | |
192 | int i; | |
193 | int j; | |
194 | ||
195 | for (i = 0; i < cptab->ctb_nparts; i++) { | |
196 | if (len > 0) { | |
197 | rc = snprintf(tmp, len, "%d\t: ", i); | |
198 | len -= rc; | |
199 | } | |
200 | ||
201 | if (len <= 0) { | |
202 | rc = -EFBIG; | |
203 | goto out; | |
204 | } | |
205 | ||
206 | tmp += rc; | |
207 | for_each_cpu_mask(j, *cptab->ctb_parts[i].cpt_cpumask) { | |
208 | rc = snprintf(tmp, len, "%d ", j); | |
209 | len -= rc; | |
210 | if (len <= 0) { | |
211 | rc = -EFBIG; | |
212 | goto out; | |
213 | } | |
214 | tmp += rc; | |
215 | } | |
216 | ||
217 | *tmp = '\n'; | |
218 | tmp++; | |
219 | len--; | |
220 | } | |
221 | ||
222 | out: | |
223 | if (rc < 0) | |
224 | return rc; | |
225 | ||
226 | return tmp - buf; | |
227 | } | |
228 | EXPORT_SYMBOL(cfs_cpt_table_print); | |
229 | ||
230 | int | |
231 | cfs_cpt_number(struct cfs_cpt_table *cptab) | |
232 | { | |
233 | return cptab->ctb_nparts; | |
234 | } | |
235 | EXPORT_SYMBOL(cfs_cpt_number); | |
236 | ||
237 | int | |
238 | cfs_cpt_weight(struct cfs_cpt_table *cptab, int cpt) | |
239 | { | |
240 | LASSERT(cpt == CFS_CPT_ANY || (cpt >= 0 && cpt < cptab->ctb_nparts)); | |
241 | ||
242 | return cpt == CFS_CPT_ANY ? | |
243 | cpus_weight(*cptab->ctb_cpumask) : | |
244 | cpus_weight(*cptab->ctb_parts[cpt].cpt_cpumask); | |
245 | } | |
246 | EXPORT_SYMBOL(cfs_cpt_weight); | |
247 | ||
248 | int | |
249 | cfs_cpt_online(struct cfs_cpt_table *cptab, int cpt) | |
250 | { | |
251 | LASSERT(cpt == CFS_CPT_ANY || (cpt >= 0 && cpt < cptab->ctb_nparts)); | |
252 | ||
253 | return cpt == CFS_CPT_ANY ? | |
254 | any_online_cpu(*cptab->ctb_cpumask) != NR_CPUS : | |
255 | any_online_cpu(*cptab->ctb_parts[cpt].cpt_cpumask) != NR_CPUS; | |
256 | } | |
257 | EXPORT_SYMBOL(cfs_cpt_online); | |
258 | ||
259 | cpumask_t * | |
260 | cfs_cpt_cpumask(struct cfs_cpt_table *cptab, int cpt) | |
261 | { | |
262 | LASSERT(cpt == CFS_CPT_ANY || (cpt >= 0 && cpt < cptab->ctb_nparts)); | |
263 | ||
264 | return cpt == CFS_CPT_ANY ? | |
265 | cptab->ctb_cpumask : cptab->ctb_parts[cpt].cpt_cpumask; | |
266 | } | |
267 | EXPORT_SYMBOL(cfs_cpt_cpumask); | |
268 | ||
269 | nodemask_t * | |
270 | cfs_cpt_nodemask(struct cfs_cpt_table *cptab, int cpt) | |
271 | { | |
272 | LASSERT(cpt == CFS_CPT_ANY || (cpt >= 0 && cpt < cptab->ctb_nparts)); | |
273 | ||
274 | return cpt == CFS_CPT_ANY ? | |
275 | cptab->ctb_nodemask : cptab->ctb_parts[cpt].cpt_nodemask; | |
276 | } | |
277 | EXPORT_SYMBOL(cfs_cpt_nodemask); | |
278 | ||
279 | int | |
280 | cfs_cpt_set_cpu(struct cfs_cpt_table *cptab, int cpt, int cpu) | |
281 | { | |
282 | int node; | |
283 | ||
284 | LASSERT(cpt >= 0 && cpt < cptab->ctb_nparts); | |
285 | ||
286 | if (cpu < 0 || cpu >= NR_CPUS || !cpu_online(cpu)) { | |
287 | CDEBUG(D_INFO, "CPU %d is invalid or it's offline\n", cpu); | |
288 | return 0; | |
289 | } | |
290 | ||
291 | if (cptab->ctb_cpu2cpt[cpu] != -1) { | |
292 | CDEBUG(D_INFO, "CPU %d is already in partition %d\n", | |
293 | cpu, cptab->ctb_cpu2cpt[cpu]); | |
294 | return 0; | |
295 | } | |
296 | ||
297 | cptab->ctb_cpu2cpt[cpu] = cpt; | |
298 | ||
299 | LASSERT(!cpu_isset(cpu, *cptab->ctb_cpumask)); | |
300 | LASSERT(!cpu_isset(cpu, *cptab->ctb_parts[cpt].cpt_cpumask)); | |
301 | ||
302 | cpu_set(cpu, *cptab->ctb_cpumask); | |
303 | cpu_set(cpu, *cptab->ctb_parts[cpt].cpt_cpumask); | |
304 | ||
305 | node = cpu_to_node(cpu); | |
306 | ||
307 | /* first CPU of @node in this CPT table */ | |
308 | if (!node_isset(node, *cptab->ctb_nodemask)) | |
309 | node_set(node, *cptab->ctb_nodemask); | |
310 | ||
311 | /* first CPU of @node in this partition */ | |
312 | if (!node_isset(node, *cptab->ctb_parts[cpt].cpt_nodemask)) | |
313 | node_set(node, *cptab->ctb_parts[cpt].cpt_nodemask); | |
314 | ||
315 | return 1; | |
316 | } | |
317 | EXPORT_SYMBOL(cfs_cpt_set_cpu); | |
318 | ||
319 | void | |
320 | cfs_cpt_unset_cpu(struct cfs_cpt_table *cptab, int cpt, int cpu) | |
321 | { | |
322 | int node; | |
323 | int i; | |
324 | ||
325 | LASSERT(cpt == CFS_CPT_ANY || (cpt >= 0 && cpt < cptab->ctb_nparts)); | |
326 | ||
327 | if (cpu < 0 || cpu >= NR_CPUS) { | |
328 | CDEBUG(D_INFO, "Invalid CPU id %d\n", cpu); | |
329 | return; | |
330 | } | |
331 | ||
332 | if (cpt == CFS_CPT_ANY) { | |
333 | /* caller doesn't know the partition ID */ | |
334 | cpt = cptab->ctb_cpu2cpt[cpu]; | |
335 | if (cpt < 0) { /* not set in this CPT-table */ | |
336 | CDEBUG(D_INFO, "Try to unset cpu %d which is " | |
337 | "not in CPT-table %p\n", cpt, cptab); | |
338 | return; | |
339 | } | |
340 | ||
341 | } else if (cpt != cptab->ctb_cpu2cpt[cpu]) { | |
342 | CDEBUG(D_INFO, | |
343 | "CPU %d is not in cpu-partition %d\n", cpu, cpt); | |
344 | return; | |
345 | } | |
346 | ||
347 | LASSERT(cpu_isset(cpu, *cptab->ctb_parts[cpt].cpt_cpumask)); | |
348 | LASSERT(cpu_isset(cpu, *cptab->ctb_cpumask)); | |
349 | ||
350 | cpu_clear(cpu, *cptab->ctb_parts[cpt].cpt_cpumask); | |
351 | cpu_clear(cpu, *cptab->ctb_cpumask); | |
352 | cptab->ctb_cpu2cpt[cpu] = -1; | |
353 | ||
354 | node = cpu_to_node(cpu); | |
355 | ||
356 | LASSERT(node_isset(node, *cptab->ctb_parts[cpt].cpt_nodemask)); | |
357 | LASSERT(node_isset(node, *cptab->ctb_nodemask)); | |
358 | ||
359 | for_each_cpu_mask(i, *cptab->ctb_parts[cpt].cpt_cpumask) { | |
360 | /* this CPT has other CPU belonging to this node? */ | |
361 | if (cpu_to_node(i) == node) | |
362 | break; | |
363 | } | |
364 | ||
365 | if (i == NR_CPUS) | |
366 | node_clear(node, *cptab->ctb_parts[cpt].cpt_nodemask); | |
367 | ||
368 | for_each_cpu_mask(i, *cptab->ctb_cpumask) { | |
369 | /* this CPT-table has other CPU belonging to this node? */ | |
370 | if (cpu_to_node(i) == node) | |
371 | break; | |
372 | } | |
373 | ||
374 | if (i == NR_CPUS) | |
375 | node_clear(node, *cptab->ctb_nodemask); | |
376 | ||
377 | return; | |
378 | } | |
379 | EXPORT_SYMBOL(cfs_cpt_unset_cpu); | |
380 | ||
381 | int | |
382 | cfs_cpt_set_cpumask(struct cfs_cpt_table *cptab, int cpt, cpumask_t *mask) | |
383 | { | |
384 | int i; | |
385 | ||
386 | if (cpus_weight(*mask) == 0 || any_online_cpu(*mask) == NR_CPUS) { | |
387 | CDEBUG(D_INFO, "No online CPU is found in the CPU mask " | |
388 | "for CPU partition %d\n", cpt); | |
389 | return 0; | |
390 | } | |
391 | ||
392 | for_each_cpu_mask(i, *mask) { | |
393 | if (!cfs_cpt_set_cpu(cptab, cpt, i)) | |
394 | return 0; | |
395 | } | |
396 | ||
397 | return 1; | |
398 | } | |
399 | EXPORT_SYMBOL(cfs_cpt_set_cpumask); | |
400 | ||
401 | void | |
402 | cfs_cpt_unset_cpumask(struct cfs_cpt_table *cptab, int cpt, cpumask_t *mask) | |
403 | { | |
404 | int i; | |
405 | ||
406 | for_each_cpu_mask(i, *mask) | |
407 | cfs_cpt_unset_cpu(cptab, cpt, i); | |
408 | } | |
409 | EXPORT_SYMBOL(cfs_cpt_unset_cpumask); | |
410 | ||
411 | int | |
412 | cfs_cpt_set_node(struct cfs_cpt_table *cptab, int cpt, int node) | |
413 | { | |
414 | cpumask_t *mask; | |
415 | int rc; | |
416 | ||
417 | if (node < 0 || node >= MAX_NUMNODES) { | |
418 | CDEBUG(D_INFO, | |
419 | "Invalid NUMA id %d for CPU partition %d\n", node, cpt); | |
420 | return 0; | |
421 | } | |
422 | ||
6246dab1 | 423 | mutex_lock(&cpt_data.cpt_mutex); |
d7e09d03 PT |
424 | |
425 | mask = cpt_data.cpt_cpumask; | |
426 | cfs_node_to_cpumask(node, mask); | |
427 | ||
428 | rc = cfs_cpt_set_cpumask(cptab, cpt, mask); | |
429 | ||
6246dab1 | 430 | mutex_unlock(&cpt_data.cpt_mutex); |
d7e09d03 PT |
431 | |
432 | return rc; | |
433 | } | |
434 | EXPORT_SYMBOL(cfs_cpt_set_node); | |
435 | ||
436 | void | |
437 | cfs_cpt_unset_node(struct cfs_cpt_table *cptab, int cpt, int node) | |
438 | { | |
439 | cpumask_t *mask; | |
440 | ||
441 | if (node < 0 || node >= MAX_NUMNODES) { | |
442 | CDEBUG(D_INFO, | |
443 | "Invalid NUMA id %d for CPU partition %d\n", node, cpt); | |
444 | return; | |
445 | } | |
446 | ||
6246dab1 | 447 | mutex_lock(&cpt_data.cpt_mutex); |
d7e09d03 PT |
448 | |
449 | mask = cpt_data.cpt_cpumask; | |
450 | cfs_node_to_cpumask(node, mask); | |
451 | ||
452 | cfs_cpt_unset_cpumask(cptab, cpt, mask); | |
453 | ||
6246dab1 | 454 | mutex_unlock(&cpt_data.cpt_mutex); |
d7e09d03 PT |
455 | } |
456 | EXPORT_SYMBOL(cfs_cpt_unset_node); | |
457 | ||
458 | int | |
459 | cfs_cpt_set_nodemask(struct cfs_cpt_table *cptab, int cpt, nodemask_t *mask) | |
460 | { | |
461 | int i; | |
462 | ||
463 | for_each_node_mask(i, *mask) { | |
464 | if (!cfs_cpt_set_node(cptab, cpt, i)) | |
465 | return 0; | |
466 | } | |
467 | ||
468 | return 1; | |
469 | } | |
470 | EXPORT_SYMBOL(cfs_cpt_set_nodemask); | |
471 | ||
472 | void | |
473 | cfs_cpt_unset_nodemask(struct cfs_cpt_table *cptab, int cpt, nodemask_t *mask) | |
474 | { | |
475 | int i; | |
476 | ||
477 | for_each_node_mask(i, *mask) | |
478 | cfs_cpt_unset_node(cptab, cpt, i); | |
479 | } | |
480 | EXPORT_SYMBOL(cfs_cpt_unset_nodemask); | |
481 | ||
482 | void | |
483 | cfs_cpt_clear(struct cfs_cpt_table *cptab, int cpt) | |
484 | { | |
485 | int last; | |
486 | int i; | |
487 | ||
488 | if (cpt == CFS_CPT_ANY) { | |
489 | last = cptab->ctb_nparts - 1; | |
490 | cpt = 0; | |
491 | } else { | |
492 | last = cpt; | |
493 | } | |
494 | ||
495 | for (; cpt <= last; cpt++) { | |
496 | for_each_cpu_mask(i, *cptab->ctb_parts[cpt].cpt_cpumask) | |
497 | cfs_cpt_unset_cpu(cptab, cpt, i); | |
498 | } | |
499 | } | |
500 | EXPORT_SYMBOL(cfs_cpt_clear); | |
501 | ||
502 | int | |
503 | cfs_cpt_spread_node(struct cfs_cpt_table *cptab, int cpt) | |
504 | { | |
505 | nodemask_t *mask; | |
506 | int weight; | |
507 | int rotor; | |
508 | int node; | |
509 | ||
510 | /* convert CPU partition ID to HW node id */ | |
511 | ||
512 | if (cpt < 0 || cpt >= cptab->ctb_nparts) { | |
513 | mask = cptab->ctb_nodemask; | |
514 | rotor = cptab->ctb_spread_rotor++; | |
515 | } else { | |
516 | mask = cptab->ctb_parts[cpt].cpt_nodemask; | |
517 | rotor = cptab->ctb_parts[cpt].cpt_spread_rotor++; | |
518 | } | |
519 | ||
520 | weight = nodes_weight(*mask); | |
521 | LASSERT(weight > 0); | |
522 | ||
523 | rotor %= weight; | |
524 | ||
525 | for_each_node_mask(node, *mask) { | |
526 | if (rotor-- == 0) | |
527 | return node; | |
528 | } | |
529 | ||
530 | LBUG(); | |
531 | return 0; | |
532 | } | |
533 | EXPORT_SYMBOL(cfs_cpt_spread_node); | |
534 | ||
535 | int | |
536 | cfs_cpt_current(struct cfs_cpt_table *cptab, int remap) | |
537 | { | |
538 | int cpu = smp_processor_id(); | |
539 | int cpt = cptab->ctb_cpu2cpt[cpu]; | |
540 | ||
541 | if (cpt < 0) { | |
542 | if (!remap) | |
543 | return cpt; | |
544 | ||
545 | /* don't return negative value for safety of upper layer, | |
546 | * instead we shadow the unknown cpu to a valid partition ID */ | |
547 | cpt = cpu % cptab->ctb_nparts; | |
548 | } | |
549 | ||
550 | return cpt; | |
551 | } | |
552 | EXPORT_SYMBOL(cfs_cpt_current); | |
553 | ||
554 | int | |
555 | cfs_cpt_of_cpu(struct cfs_cpt_table *cptab, int cpu) | |
556 | { | |
557 | LASSERT(cpu >= 0 && cpu < NR_CPUS); | |
558 | ||
559 | return cptab->ctb_cpu2cpt[cpu]; | |
560 | } | |
561 | EXPORT_SYMBOL(cfs_cpt_of_cpu); | |
562 | ||
563 | int | |
564 | cfs_cpt_bind(struct cfs_cpt_table *cptab, int cpt) | |
565 | { | |
566 | cpumask_t *cpumask; | |
567 | nodemask_t *nodemask; | |
568 | int rc; | |
569 | int i; | |
570 | ||
571 | LASSERT(cpt == CFS_CPT_ANY || (cpt >= 0 && cpt < cptab->ctb_nparts)); | |
572 | ||
573 | if (cpt == CFS_CPT_ANY) { | |
574 | cpumask = cptab->ctb_cpumask; | |
575 | nodemask = cptab->ctb_nodemask; | |
576 | } else { | |
577 | cpumask = cptab->ctb_parts[cpt].cpt_cpumask; | |
578 | nodemask = cptab->ctb_parts[cpt].cpt_nodemask; | |
579 | } | |
580 | ||
581 | if (any_online_cpu(*cpumask) == NR_CPUS) { | |
582 | CERROR("No online CPU found in CPU partition %d, did someone " | |
583 | "do CPU hotplug on system? You might need to reload " | |
584 | "Lustre modules to keep system working well.\n", cpt); | |
585 | return -EINVAL; | |
586 | } | |
587 | ||
588 | for_each_online_cpu(i) { | |
589 | if (cpu_isset(i, *cpumask)) | |
590 | continue; | |
591 | ||
32654b67 | 592 | rc = set_cpus_allowed_ptr(current, cpumask); |
d7e09d03 PT |
593 | set_mems_allowed(*nodemask); |
594 | if (rc == 0) | |
595 | schedule(); /* switch to allowed CPU */ | |
596 | ||
597 | return rc; | |
598 | } | |
599 | ||
600 | /* don't need to set affinity because all online CPUs are covered */ | |
601 | return 0; | |
602 | } | |
603 | EXPORT_SYMBOL(cfs_cpt_bind); | |
604 | ||
605 | /** | |
606 | * Choose max to \a number CPUs from \a node and set them in \a cpt. | |
607 | * We always prefer to choose CPU in the same core/socket. | |
608 | */ | |
609 | static int | |
610 | cfs_cpt_choose_ncpus(struct cfs_cpt_table *cptab, int cpt, | |
611 | cpumask_t *node, int number) | |
612 | { | |
613 | cpumask_t *socket = NULL; | |
614 | cpumask_t *core = NULL; | |
615 | int rc = 0; | |
616 | int cpu; | |
617 | ||
618 | LASSERT(number > 0); | |
619 | ||
620 | if (number >= cpus_weight(*node)) { | |
621 | while (!cpus_empty(*node)) { | |
622 | cpu = first_cpu(*node); | |
623 | ||
624 | rc = cfs_cpt_set_cpu(cptab, cpt, cpu); | |
625 | if (!rc) | |
626 | return -EINVAL; | |
627 | cpu_clear(cpu, *node); | |
628 | } | |
629 | return 0; | |
630 | } | |
631 | ||
632 | /* allocate scratch buffer */ | |
633 | LIBCFS_ALLOC(socket, cpumask_size()); | |
634 | LIBCFS_ALLOC(core, cpumask_size()); | |
635 | if (socket == NULL || core == NULL) { | |
636 | rc = -ENOMEM; | |
637 | goto out; | |
638 | } | |
639 | ||
640 | while (!cpus_empty(*node)) { | |
641 | cpu = first_cpu(*node); | |
642 | ||
643 | /* get cpumask for cores in the same socket */ | |
644 | cfs_cpu_core_siblings(cpu, socket); | |
645 | cpus_and(*socket, *socket, *node); | |
646 | ||
647 | LASSERT(!cpus_empty(*socket)); | |
648 | ||
649 | while (!cpus_empty(*socket)) { | |
650 | int i; | |
651 | ||
652 | /* get cpumask for hts in the same core */ | |
653 | cfs_cpu_ht_siblings(cpu, core); | |
654 | cpus_and(*core, *core, *node); | |
655 | ||
656 | LASSERT(!cpus_empty(*core)); | |
657 | ||
658 | for_each_cpu_mask(i, *core) { | |
659 | cpu_clear(i, *socket); | |
660 | cpu_clear(i, *node); | |
661 | ||
662 | rc = cfs_cpt_set_cpu(cptab, cpt, i); | |
663 | if (!rc) { | |
664 | rc = -EINVAL; | |
665 | goto out; | |
666 | } | |
667 | ||
668 | if (--number == 0) | |
669 | goto out; | |
670 | } | |
671 | cpu = first_cpu(*socket); | |
672 | } | |
673 | } | |
674 | ||
675 | out: | |
676 | if (socket != NULL) | |
677 | LIBCFS_FREE(socket, cpumask_size()); | |
678 | if (core != NULL) | |
679 | LIBCFS_FREE(core, cpumask_size()); | |
680 | return rc; | |
681 | } | |
682 | ||
#define CPT_WEIGHT_MIN 4u

/**
 * Estimate a reasonable number of CPU partitions from the number of online
 * CPUs and online NUMA nodes.
 *
 * The result always divides the number of online CPUs evenly, and is 1 when
 * there are no more than CPT_WEIGHT_MIN CPUs.
 */
static unsigned int cfs_cpt_num_estimate(void)
{
	unsigned nodes = num_online_nodes();
	unsigned cpus = num_online_cpus();
	unsigned ncpt = 1;

	if (cpus > CPT_WEIGHT_MIN) {
		/* generate reasonable number of CPU partitions based on total
		 * number of CPUs, Preferred N should be power2 and match this
		 * condition: 2 * (N - 1)^2 < NCPUS <= 2 * N^2 */
		ncpt = 2;
		while (cpus > 2 * ncpt * ncpt)
			ncpt <<= 1;

		if (ncpt <= nodes) {
			/* fat numa system: halve nodes until it fits */
			while (nodes > ncpt)
				nodes >>= 1;
		} else {
			/* ncpt > nodes: double nodes while it still fits */
			while ((nodes << 1) <= ncpt)
				nodes <<= 1;
		}

		ncpt = nodes;
	}

#if (BITS_PER_LONG == 32)
	/* config many CPU partitions on 32-bit system could consume
	 * too much memory */
	ncpt = min(2U, ncpt);
#endif
	/* shrink until the CPUs split evenly; worst case is 1 */
	while (cpus % ncpt != 0)
		ncpt--;

	return ncpt;
}
724 | ||
725 | static struct cfs_cpt_table * | |
726 | cfs_cpt_table_create(int ncpt) | |
727 | { | |
728 | struct cfs_cpt_table *cptab = NULL; | |
729 | cpumask_t *mask = NULL; | |
730 | int cpt = 0; | |
731 | int num; | |
732 | int rc; | |
733 | int i; | |
734 | ||
735 | rc = cfs_cpt_num_estimate(); | |
736 | if (ncpt <= 0) | |
737 | ncpt = rc; | |
738 | ||
739 | if (ncpt > num_online_cpus() || ncpt > 4 * rc) { | |
740 | CWARN("CPU partition number %d is larger than suggested " | |
741 | "value (%d), your system may have performance" | |
742 | "issue or run out of memory while under pressure\n", | |
743 | ncpt, rc); | |
744 | } | |
745 | ||
746 | if (num_online_cpus() % ncpt != 0) { | |
747 | CERROR("CPU number %d is not multiple of cpu_npartition %d, " | |
748 | "please try different cpu_npartitions value or" | |
749 | "set pattern string by cpu_pattern=STRING\n", | |
750 | (int)num_online_cpus(), ncpt); | |
751 | goto failed; | |
752 | } | |
753 | ||
754 | cptab = cfs_cpt_table_alloc(ncpt); | |
755 | if (cptab == NULL) { | |
756 | CERROR("Failed to allocate CPU map(%d)\n", ncpt); | |
757 | goto failed; | |
758 | } | |
759 | ||
760 | num = num_online_cpus() / ncpt; | |
761 | if (num == 0) { | |
762 | CERROR("CPU changed while setting CPU partition\n"); | |
763 | goto failed; | |
764 | } | |
765 | ||
766 | LIBCFS_ALLOC(mask, cpumask_size()); | |
767 | if (mask == NULL) { | |
768 | CERROR("Failed to allocate scratch cpumask\n"); | |
769 | goto failed; | |
770 | } | |
771 | ||
772 | for_each_online_node(i) { | |
773 | cfs_node_to_cpumask(i, mask); | |
774 | ||
775 | while (!cpus_empty(*mask)) { | |
776 | struct cfs_cpu_partition *part; | |
777 | int n; | |
778 | ||
779 | if (cpt >= ncpt) | |
780 | goto failed; | |
781 | ||
782 | part = &cptab->ctb_parts[cpt]; | |
783 | ||
784 | n = num - cpus_weight(*part->cpt_cpumask); | |
785 | LASSERT(n > 0); | |
786 | ||
787 | rc = cfs_cpt_choose_ncpus(cptab, cpt, mask, n); | |
788 | if (rc < 0) | |
789 | goto failed; | |
790 | ||
791 | LASSERT(num >= cpus_weight(*part->cpt_cpumask)); | |
792 | if (num == cpus_weight(*part->cpt_cpumask)) | |
793 | cpt++; | |
794 | } | |
795 | } | |
796 | ||
797 | if (cpt != ncpt || | |
798 | num != cpus_weight(*cptab->ctb_parts[ncpt - 1].cpt_cpumask)) { | |
799 | CERROR("Expect %d(%d) CPU partitions but got %d(%d), " | |
800 | "CPU hotplug/unplug while setting?\n", | |
801 | cptab->ctb_nparts, num, cpt, | |
802 | cpus_weight(*cptab->ctb_parts[ncpt - 1].cpt_cpumask)); | |
803 | goto failed; | |
804 | } | |
805 | ||
806 | LIBCFS_FREE(mask, cpumask_size()); | |
807 | ||
808 | return cptab; | |
809 | ||
810 | failed: | |
811 | CERROR("Failed to setup CPU-partition-table with %d " | |
812 | "CPU-partitions, online HW nodes: %d, HW cpus: %d.\n", | |
813 | ncpt, num_online_nodes(), num_online_cpus()); | |
814 | ||
815 | if (mask != NULL) | |
816 | LIBCFS_FREE(mask, cpumask_size()); | |
817 | ||
818 | if (cptab != NULL) | |
819 | cfs_cpt_table_free(cptab); | |
820 | ||
821 | return NULL; | |
822 | } | |
823 | ||
824 | static struct cfs_cpt_table * | |
825 | cfs_cpt_table_create_pattern(char *pattern) | |
826 | { | |
827 | struct cfs_cpt_table *cptab; | |
828 | char *str = pattern; | |
829 | int node = 0; | |
830 | int high; | |
831 | int ncpt; | |
832 | int c; | |
833 | ||
834 | for (ncpt = 0;; ncpt++) { /* quick scan bracket */ | |
835 | str = strchr(str, '['); | |
836 | if (str == NULL) | |
837 | break; | |
838 | str++; | |
839 | } | |
840 | ||
841 | str = cfs_trimwhite(pattern); | |
842 | if (*str == 'n' || *str == 'N') { | |
843 | pattern = str + 1; | |
844 | node = 1; | |
845 | } | |
846 | ||
847 | if (ncpt == 0 || | |
848 | (node && ncpt > num_online_nodes()) || | |
849 | (!node && ncpt > num_online_cpus())) { | |
850 | CERROR("Invalid pattern %s, or too many partitions %d\n", | |
851 | pattern, ncpt); | |
852 | return NULL; | |
853 | } | |
854 | ||
855 | high = node ? MAX_NUMNODES - 1 : NR_CPUS - 1; | |
856 | ||
857 | cptab = cfs_cpt_table_alloc(ncpt); | |
858 | if (cptab == NULL) { | |
859 | CERROR("Failed to allocate cpu partition table\n"); | |
860 | return NULL; | |
861 | } | |
862 | ||
863 | for (str = cfs_trimwhite(pattern), c = 0;; c++) { | |
864 | struct cfs_range_expr *range; | |
865 | struct cfs_expr_list *el; | |
866 | char *bracket = strchr(str, '['); | |
867 | int cpt; | |
868 | int rc; | |
869 | int i; | |
870 | int n; | |
871 | ||
872 | if (bracket == NULL) { | |
873 | if (*str != 0) { | |
874 | CERROR("Invalid pattern %s\n", str); | |
875 | goto failed; | |
876 | } else if (c != ncpt) { | |
877 | CERROR("expect %d partitions but found %d\n", | |
878 | ncpt, c); | |
879 | goto failed; | |
880 | } | |
881 | break; | |
882 | } | |
883 | ||
16e9f6d4 | 884 | if (sscanf(str, "%d%n", &cpt, &n) < 1) { |
d7e09d03 PT |
885 | CERROR("Invalid cpu pattern %s\n", str); |
886 | goto failed; | |
887 | } | |
888 | ||
889 | if (cpt < 0 || cpt >= ncpt) { | |
890 | CERROR("Invalid partition id %d, total partitions %d\n", | |
891 | cpt, ncpt); | |
892 | goto failed; | |
893 | } | |
894 | ||
895 | if (cfs_cpt_weight(cptab, cpt) != 0) { | |
896 | CERROR("Partition %d has already been set.\n", cpt); | |
897 | goto failed; | |
898 | } | |
899 | ||
900 | str = cfs_trimwhite(str + n); | |
901 | if (str != bracket) { | |
902 | CERROR("Invalid pattern %s\n", str); | |
903 | goto failed; | |
904 | } | |
905 | ||
906 | bracket = strchr(str, ']'); | |
907 | if (bracket == NULL) { | |
908 | CERROR("missing right bracket for cpt %d, %s\n", | |
909 | cpt, str); | |
910 | goto failed; | |
911 | } | |
912 | ||
913 | if (cfs_expr_list_parse(str, (bracket - str) + 1, | |
914 | 0, high, &el) != 0) { | |
915 | CERROR("Can't parse number range: %s\n", str); | |
916 | goto failed; | |
917 | } | |
918 | ||
919 | list_for_each_entry(range, &el->el_exprs, re_link) { | |
920 | for (i = range->re_lo; i <= range->re_hi; i++) { | |
921 | if ((i - range->re_lo) % range->re_stride != 0) | |
922 | continue; | |
923 | ||
924 | rc = node ? cfs_cpt_set_node(cptab, cpt, i) : | |
925 | cfs_cpt_set_cpu(cptab, cpt, i); | |
926 | if (!rc) { | |
927 | cfs_expr_list_free(el); | |
928 | goto failed; | |
929 | } | |
930 | } | |
931 | } | |
932 | ||
933 | cfs_expr_list_free(el); | |
934 | ||
935 | if (!cfs_cpt_online(cptab, cpt)) { | |
936 | CERROR("No online CPU is found on partition %d\n", cpt); | |
937 | goto failed; | |
938 | } | |
939 | ||
940 | str = cfs_trimwhite(bracket + 1); | |
941 | } | |
942 | ||
943 | return cptab; | |
944 | ||
945 | failed: | |
946 | cfs_cpt_table_free(cptab); | |
947 | return NULL; | |
948 | } | |
949 | ||
#ifdef CONFIG_HOTPLUG_CPU
/**
 * CPU hotplug notifier callback.
 *
 * CPU online/dead events bump cpt_data.cpt_version.  For a dead CPU a
 * message is also logged, at warning level when no hardware thread of that
 * CPU's core is online any more.  Always returns NOTIFY_OK.
 */
static int
cfs_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
{
	unsigned int cpu = (unsigned long)hcpu;
	bool dead = action == CPU_DEAD || action == CPU_DEAD_FROZEN;
	bool online = action == CPU_ONLINE || action == CPU_ONLINE_FROZEN;
	bool warn;

	if (dead || online) {
		spin_lock(&cpt_data.cpt_lock);
		cpt_data.cpt_version++;
		spin_unlock(&cpt_data.cpt_lock);
	}

	if (!dead) {
		CDEBUG(D_INFO, "CPU changed [cpu %u action %lx]\n",
		       cpu, action);
		return NOTIFY_OK;
	}

	mutex_lock(&cpt_data.cpt_mutex);
	/* if all HTs in a core are offline, it may break affinity */
	cfs_cpu_ht_siblings(cpu, cpt_data.cpt_cpumask);
	warn = any_online_cpu(*cpt_data.cpt_cpumask) >= nr_cpu_ids;
	mutex_unlock(&cpt_data.cpt_mutex);

	CDEBUG(warn ? D_WARNING : D_INFO,
	       "Lustre: can't support CPU plug-out well now, "
	       "performance and stability could be impacted "
	       "[CPU %u action: %lx]\n", cpu, action);

	return NOTIFY_OK;
}

static struct notifier_block cfs_cpu_notifier = {
	.notifier_call	= cfs_cpu_notify,
	.priority	= 0
};

#endif
992 | ||
993 | void | |
994 | cfs_cpu_fini(void) | |
995 | { | |
996 | if (cfs_cpt_table != NULL) | |
997 | cfs_cpt_table_free(cfs_cpt_table); | |
998 | ||
999 | #ifdef CONFIG_HOTPLUG_CPU | |
1000 | unregister_hotcpu_notifier(&cfs_cpu_notifier); | |
1001 | #endif | |
1002 | if (cpt_data.cpt_cpumask != NULL) | |
1003 | LIBCFS_FREE(cpt_data.cpt_cpumask, cpumask_size()); | |
1004 | } | |
1005 | ||
1006 | int | |
1007 | cfs_cpu_init(void) | |
1008 | { | |
1009 | LASSERT(cfs_cpt_table == NULL); | |
1010 | ||
1011 | memset(&cpt_data, 0, sizeof(cpt_data)); | |
1012 | ||
1013 | LIBCFS_ALLOC(cpt_data.cpt_cpumask, cpumask_size()); | |
1014 | if (cpt_data.cpt_cpumask == NULL) { | |
1015 | CERROR("Failed to allocate scratch buffer\n"); | |
1016 | return -1; | |
1017 | } | |
1018 | ||
1019 | spin_lock_init(&cpt_data.cpt_lock); | |
6246dab1 | 1020 | mutex_init(&cpt_data.cpt_mutex); |
d7e09d03 PT |
1021 | |
1022 | #ifdef CONFIG_HOTPLUG_CPU | |
1023 | register_hotcpu_notifier(&cfs_cpu_notifier); | |
1024 | #endif | |
1025 | ||
1026 | if (*cpu_pattern != 0) { | |
1027 | cfs_cpt_table = cfs_cpt_table_create_pattern(cpu_pattern); | |
1028 | if (cfs_cpt_table == NULL) { | |
1029 | CERROR("Failed to create cptab from pattern %s\n", | |
1030 | cpu_pattern); | |
1031 | goto failed; | |
1032 | } | |
1033 | ||
1034 | } else { | |
1035 | cfs_cpt_table = cfs_cpt_table_create(cpu_npartitions); | |
1036 | if (cfs_cpt_table == NULL) { | |
1037 | CERROR("Failed to create ptable with npartitions %d\n", | |
1038 | cpu_npartitions); | |
1039 | goto failed; | |
1040 | } | |
1041 | } | |
1042 | ||
1043 | spin_lock(&cpt_data.cpt_lock); | |
1044 | if (cfs_cpt_table->ctb_version != cpt_data.cpt_version) { | |
1045 | spin_unlock(&cpt_data.cpt_lock); | |
1046 | CERROR("CPU hotplug/unplug during setup\n"); | |
1047 | goto failed; | |
1048 | } | |
1049 | spin_unlock(&cpt_data.cpt_lock); | |
1050 | ||
1051 | LCONSOLE(0, "HW CPU cores: %d, npartitions: %d\n", | |
1052 | num_online_cpus(), cfs_cpt_number(cfs_cpt_table)); | |
1053 | return 0; | |
1054 | ||
1055 | failed: | |
1056 | cfs_cpu_fini(); | |
1057 | return -1; | |
1058 | } | |
1059 | ||
1060 | #endif |