// SPDX-License-Identifier: MIT
// SPDX-FileCopyrightText: 2024 Mathieu Desnoyers <mathieu.desnoyers@efficios.com>

#include <rseq/percpu-alloc.h>
#include <sys/mman.h>
#include <assert.h>
#include <string.h>
#include <pthread.h>
#include <unistd.h>
#include <stdlib.h>
#include <rseq/compiler.h>
#include <errno.h>
#include <stdint.h>
#include <stdbool.h>
#include <stdio.h>

#ifdef HAVE_LIBNUMA
# include <numa.h>
# include <numaif.h>
#endif

/*
 * rseq-percpu-alloc.c: rseq per-cpu memory allocator.
 *
 * The rseq per-cpu memory allocator allows the application to request
 * memory pools of per-cpu memory, each of a given virtual address size
 * per cpu, for a given maximum number of CPUs.
 */

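/*
 * Illustrative usage sketch (not part of the allocator; sizes and the
 * cpu index are arbitrary example values, and error handling is
 * omitted). A pool of 128-byte items with a 1 MB virtual address range
 * per cpu, for up to 64 CPUs, is created; one item is allocated and
 * zeroed, then its per-cpu copy is accessed through the low-level
 * __rseq_percpu_ptr() helper defined below:
 *
 *	struct rseq_percpu_pool *pool;
 *	void *p;
 *	int *counter;
 *
 *	pool = rseq_percpu_pool_create(128, 1UL << 20, 64,
 *			PROT_READ | PROT_WRITE,
 *			MAP_PRIVATE | MAP_ANONYMOUS, -1, 0, 0);
 *	p = rseq_percpu_zmalloc(pool);
 *	counter = (int *) __rseq_percpu_ptr(p, cpu);
 *	(*counter)++;
 *	rseq_percpu_free(p);
 *	rseq_percpu_pool_destroy(pool);
 */
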
/* Use lowest bits of per-cpu addresses to index the pool. */
#if RSEQ_BITS_PER_LONG == 64
# define OFFSET_SHIFT	16
#else
# define OFFSET_SHIFT	8
#endif
#define MAX_NR_POOLS	(1U << OFFSET_SHIFT)
#define POOL_MASK	(MAX_NR_POOLS - 1)

#define POOL_SET_NR_ENTRIES	(RSEQ_BITS_PER_LONG - OFFSET_SHIFT)
#define POOL_SET_MAX_ALLOC_LEN	(1UL << POOL_SET_NR_ENTRIES)

#if RSEQ_BITS_PER_LONG == 64
# define POOL_SET_MIN_ENTRY	3	/* Smallest item_len=8 */
#else
# define POOL_SET_MIN_ENTRY	2	/* Smallest item_len=4 */
#endif

#define DEFAULT_PAGE_SIZE	4096

struct free_list_node;

struct free_list_node {
	struct free_list_node *next;
};

/* This lock protects pool create/destroy. */
static pthread_mutex_t pool_lock = PTHREAD_MUTEX_INITIALIZER;

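/*
 * A pool is backed by one contiguous mapping of max_nr_cpus ranges of
 * percpu_len bytes each (cpu N's range starts at base + N * percpu_len),
 * carved into items of item_len bytes (item_len == 1 << item_order).
 */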
struct rseq_percpu_pool {
	void *base;
	unsigned int index;
	size_t item_len;
	size_t percpu_len;
	int item_order;
	int max_nr_cpus;

	/*
	 * The free list chains freed items on the cpu 0 address range.
	 * We should rethink this decision if false sharing between
	 * malloc/free from other cpus and data accesses from cpu 0
	 * becomes an issue. This is a NULL-terminated singly-linked
	 * list.
	 */
	struct free_list_node *free_list_head;
	size_t next_unused;
	/* This lock protects allocation/free within the pool. */
	pthread_mutex_t lock;
};

//TODO: the array of pools should grow dynamically on create.
static struct rseq_percpu_pool rseq_percpu_pool[MAX_NR_POOLS];

/*
 * Pool set entries are indexed by item_len rounded to the next power of
 * 2. A pool set can contain NULL pool entries, in which case the next
 * large enough entry will be used for allocation.
 */
struct rseq_percpu_pool_set {
	/* This lock protects add vs malloc/zmalloc within the pool set. */
	pthread_mutex_t lock;
	struct rseq_percpu_pool *entries[POOL_SET_NR_ENTRIES];
};

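/*
 * Round v up to the next multiple of align, which must be a power of
 * two: e.g. rseq_align(13, 8) == 16 and rseq_align(16, 8) == 16.
 */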
#define __rseq_align_mask(v, mask)	(((v) + (mask)) & ~(mask))
#define rseq_align(v, align)		__rseq_align_mask(v, (__typeof__(v)) (align) - 1)

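/*
 * Find-last-set: return the 1-based position of the most significant
 * bit set in x (e.g. fls_u64(1) == 1, fls_u64(1ULL << 63) == 64), or 0
 * if x is 0.
 */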
static __attribute__((unused))
unsigned int fls_u64(uint64_t x)
{
	unsigned int r = 64;

	if (!x)
		return 0;

	if (!(x & 0xFFFFFFFF00000000ULL)) {
		x <<= 32;
		r -= 32;
	}
	if (!(x & 0xFFFF000000000000ULL)) {
		x <<= 16;
		r -= 16;
	}
	if (!(x & 0xFF00000000000000ULL)) {
		x <<= 8;
		r -= 8;
	}
	if (!(x & 0xF000000000000000ULL)) {
		x <<= 4;
		r -= 4;
	}
	if (!(x & 0xC000000000000000ULL)) {
		x <<= 2;
		r -= 2;
	}
	if (!(x & 0x8000000000000000ULL)) {
		x <<= 1;
		r -= 1;
	}
	return r;
}

static __attribute__((unused))
unsigned int fls_u32(uint32_t x)
{
	unsigned int r = 32;

	if (!x)
		return 0;
	if (!(x & 0xFFFF0000U)) {
		x <<= 16;
		r -= 16;
	}
	if (!(x & 0xFF000000U)) {
		x <<= 8;
		r -= 8;
	}
	if (!(x & 0xF0000000U)) {
		x <<= 4;
		r -= 4;
	}
	if (!(x & 0xC0000000U)) {
		x <<= 2;
		r -= 2;
	}
	if (!(x & 0x80000000U)) {
		x <<= 1;
		r -= 1;
	}
	return r;
}

static
unsigned int fls_ulong(unsigned long x)
{
#if RSEQ_BITS_PER_LONG == 32
	return fls_u32(x);
#else
	return fls_u64(x);
#endif
}

/*
 * Return the minimum order for which x <= (1UL << order).
 * Return -1 if x is 0.
 */
static
int get_count_order_ulong(unsigned long x)
{
	if (!x)
		return -1;

	return fls_ulong(x - 1);
}

static
long rseq_get_page_len(void)
{
	long page_len = sysconf(_SC_PAGE_SIZE);

	if (page_len < 0)
		page_len = DEFAULT_PAGE_SIZE;
	return page_len;
}

static
void *__rseq_pool_percpu_ptr(struct rseq_percpu_pool *pool, int cpu, uintptr_t item_offset)
{
	return pool->base + (pool->percpu_len * cpu) + item_offset;
}

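/*
 * Pointers returned by the allocator are not directly dereferenceable:
 * they encode the pool index in their OFFSET_SHIFT low bits and the
 * item offset within a per-cpu range in the remaining upper bits. For
 * example, with OFFSET_SHIFT == 16, the item at offset 0x80 of pool 2
 * is returned as (0x80 << 16) | 2 == 0x800002. __rseq_percpu_ptr()
 * turns such an encoded pointer into the real address of the item for
 * a given cpu.
 */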
void *__rseq_percpu_ptr(void *_ptr, int cpu)
{
	uintptr_t ptr = (uintptr_t) _ptr;
	uintptr_t item_offset = ptr >> OFFSET_SHIFT;
	uintptr_t pool_index = ptr & POOL_MASK;
	struct rseq_percpu_pool *pool = &rseq_percpu_pool[pool_index];

	assert(cpu >= 0);
	return __rseq_pool_percpu_ptr(pool, cpu, item_offset);
}

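/* Zero the item at @item_offset within each per-cpu range of the pool. */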
static
void rseq_percpu_zero_item(struct rseq_percpu_pool *pool, uintptr_t item_offset)
{
	int i;

	for (i = 0; i < pool->max_nr_cpus; i++) {
		char *p = __rseq_pool_percpu_ptr(pool, i, item_offset);
		memset(p, 0, pool->item_len);
	}
}

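/*
 * When built with libnuma support, move every page of each per-cpu
 * range to the NUMA node of its cpu, as requested by numa_flags (see
 * the comment above rseq_percpu_pool_create() below).
 */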
#ifdef HAVE_LIBNUMA
static
void rseq_percpu_pool_init_numa(struct rseq_percpu_pool *pool,
		int numa_flags)
{
	unsigned long nr_pages, page;
	long ret, page_len;
	int cpu;

	if (!numa_flags)
		return;
	page_len = rseq_get_page_len();
	nr_pages = pool->percpu_len >> get_count_order_ulong(page_len);
	for (cpu = 0; cpu < pool->max_nr_cpus; cpu++) {
		int node = numa_node_of_cpu(cpu);

		/* TODO: batch move_pages() call with an array of pages. */
		for (page = 0; page < nr_pages; page++) {
			void *pageptr = __rseq_pool_percpu_ptr(pool, cpu, page * page_len);
			int status = -EPERM;

			ret = move_pages(0, 1, &pageptr, &node, &status, numa_flags);
			if (ret) {
				perror("move_pages");
				abort();
			}
		}
	}
}
#else
static
void rseq_percpu_pool_init_numa(struct rseq_percpu_pool *pool __attribute__((unused)),
		int numa_flags __attribute__((unused)))
{
}
#endif

/*
 * Expected numa_flags:
 * 0: do not move pages to specific numa nodes (use for e.g. mm_cid indexing).
 * MPOL_MF_MOVE: move process-private pages to cpu-specific numa nodes.
 * MPOL_MF_MOVE_ALL: move shared pages to cpu-specific numa nodes (requires CAP_SYS_NICE).
 */
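/*
 * Create a pool of per-cpu items of item_len bytes (rounded up to the
 * next power of two), with a percpu_len bytes address range per cpu
 * (rounded up to the page size), for up to max_nr_cpus CPUs. The
 * mmap_* arguments are passed through unchanged to mmap(2) for the
 * backing mapping of percpu_len * max_nr_cpus bytes.
 */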
struct rseq_percpu_pool *rseq_percpu_pool_create(size_t item_len,
		size_t percpu_len, int max_nr_cpus,
		int mmap_prot, int mmap_flags, int mmap_fd,
		off_t mmap_offset, int numa_flags)
{
	struct rseq_percpu_pool *pool;
	void *base;
	unsigned int i;
	int order;

	/* Make sure each item is large enough to contain free list pointers. */
	if (item_len < sizeof(void *))
		item_len = sizeof(void *);

	/* Align item_len on next power of two. */
	order = get_count_order_ulong(item_len);
	if (order < 0) {
		errno = EINVAL;
		return NULL;
	}
	item_len = 1UL << order;

	/* Align percpu_len on page size. */
	percpu_len = rseq_align(percpu_len, rseq_get_page_len());

	if (max_nr_cpus < 0 || item_len > percpu_len ||
			percpu_len > (UINTPTR_MAX >> OFFSET_SHIFT)) {
		errno = EINVAL;
		return NULL;
	}

	pthread_mutex_lock(&pool_lock);
	/* Linear scan in array of pools to find empty spot. */
	for (i = 0; i < MAX_NR_POOLS; i++) {
		pool = &rseq_percpu_pool[i];
		if (!pool->base)
			goto found_empty;
	}
	errno = ENOMEM;
	pool = NULL;
	goto end;

found_empty:
	base = mmap(NULL, percpu_len * max_nr_cpus, mmap_prot,
			mmap_flags, mmap_fd, mmap_offset);
	if (base == MAP_FAILED) {
		pool = NULL;
		goto end;
	}
	pthread_mutex_init(&pool->lock, NULL);
	pool->base = base;
	pool->percpu_len = percpu_len;
	pool->max_nr_cpus = max_nr_cpus;
	pool->index = i;
	pool->item_len = item_len;
	pool->item_order = order;
	/* NUMA page migration relies on the pool layout fields set above. */
	rseq_percpu_pool_init_numa(pool, numa_flags);
end:
	pthread_mutex_unlock(&pool_lock);
	return pool;
}

int rseq_percpu_pool_destroy(struct rseq_percpu_pool *pool)
{
	int ret;

	pthread_mutex_lock(&pool_lock);
	if (!pool->base) {
		errno = ENOENT;
		ret = -1;
		goto end;
	}
	ret = munmap(pool->base, pool->percpu_len * pool->max_nr_cpus);
	if (ret)
		goto end;
	pthread_mutex_destroy(&pool->lock);
	memset(pool, 0, sizeof(*pool));
end:
	pthread_mutex_unlock(&pool_lock);
	return ret;
}

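/*
 * Allocate one item from the pool: reuse the first entry of the free
 * list if there is one, otherwise carve out the next unused slot of
 * the per-cpu ranges. Returns an encoded per-cpu pointer (see
 * __rseq_percpu_ptr() above), or NULL with errno set to ENOMEM when
 * the pool is exhausted.
 */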
static
void *__rseq_percpu_malloc(struct rseq_percpu_pool *pool, bool zeroed)
{
	struct free_list_node *node;
	uintptr_t item_offset;
	void *addr;

	pthread_mutex_lock(&pool->lock);
	/* Get first entry from free list. */
	node = pool->free_list_head;
	if (node != NULL) {
		/* Remove node from free list (update head). */
		pool->free_list_head = node->next;
		item_offset = (uintptr_t) ((void *) node - pool->base);
		addr = (void *) ((item_offset << OFFSET_SHIFT) | pool->index);
		goto end;
	}
	if (pool->next_unused + pool->item_len > pool->percpu_len) {
		errno = ENOMEM;
		addr = NULL;
		goto end;
	}
	item_offset = pool->next_unused;
	addr = (void *) ((item_offset << OFFSET_SHIFT) | pool->index);
	pool->next_unused += pool->item_len;
end:
	pthread_mutex_unlock(&pool->lock);
	if (zeroed && addr)
		rseq_percpu_zero_item(pool, item_offset);
	return addr;
}

void *rseq_percpu_malloc(struct rseq_percpu_pool *pool)
{
	return __rseq_percpu_malloc(pool, false);
}

void *rseq_percpu_zmalloc(struct rseq_percpu_pool *pool)
{
	return __rseq_percpu_malloc(pool, true);
}

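/*
 * Return an item to its pool: the pool index and item offset are
 * decoded from the encoded pointer, and the item is pushed onto the
 * pool's free list, which lives within the cpu 0 address range.
 */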
void rseq_percpu_free(void *_ptr)
{
	uintptr_t ptr = (uintptr_t) _ptr;
	uintptr_t item_offset = ptr >> OFFSET_SHIFT;
	uintptr_t pool_index = ptr & POOL_MASK;
	struct rseq_percpu_pool *pool = &rseq_percpu_pool[pool_index];
	struct free_list_node *head, *item;

	pthread_mutex_lock(&pool->lock);
	/* Add ptr to head of free list */
	head = pool->free_list_head;
	/* Free-list is in cpu 0 range. */
	item = (struct free_list_node *) __rseq_pool_percpu_ptr(pool, 0, item_offset);
	item->next = head;
	pool->free_list_head = item;
	pthread_mutex_unlock(&pool->lock);
}

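/*
 * Illustrative pool set usage sketch (arbitrary example values, error
 * handling omitted): a set groups pools of different item sizes, and
 * allocations are served by the smallest pool that fits the request.
 *
 *	struct rseq_percpu_pool_set *set = rseq_percpu_pool_set_create();
 *
 *	rseq_percpu_pool_set_add_pool(set,
 *		rseq_percpu_pool_create(32, 1UL << 20, 64,
 *			PROT_READ | PROT_WRITE,
 *			MAP_PRIVATE | MAP_ANONYMOUS, -1, 0, 0));
 *	rseq_percpu_pool_set_add_pool(set,
 *		rseq_percpu_pool_create(1024, 1UL << 20, 64,
 *			PROT_READ | PROT_WRITE,
 *			MAP_PRIVATE | MAP_ANONYMOUS, -1, 0, 0));
 *	p = rseq_percpu_pool_set_zmalloc(set, 100);	// served by the 1024-byte pool
 *	rseq_percpu_free(p);
 *	rseq_percpu_pool_set_destroy(set);
 */
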
struct rseq_percpu_pool_set *rseq_percpu_pool_set_create(void)
{
	struct rseq_percpu_pool_set *pool_set;

	pool_set = calloc(1, sizeof(struct rseq_percpu_pool_set));
	if (!pool_set)
		return NULL;
	pthread_mutex_init(&pool_set->lock, NULL);
	return pool_set;
}

int rseq_percpu_pool_set_destroy(struct rseq_percpu_pool_set *pool_set)
{
	int order, ret;

	for (order = POOL_SET_MIN_ENTRY; order < POOL_SET_NR_ENTRIES; order++) {
		struct rseq_percpu_pool *pool = pool_set->entries[order];

		if (!pool)
			continue;
		ret = rseq_percpu_pool_destroy(pool);
		if (ret)
			return ret;
		pool_set->entries[order] = NULL;
	}
	pthread_mutex_destroy(&pool_set->lock);
	free(pool_set);
	return 0;
}

/* Ownership of pool is handed over to pool set on success. */
int rseq_percpu_pool_set_add_pool(struct rseq_percpu_pool_set *pool_set, struct rseq_percpu_pool *pool)
{
	size_t item_order = pool->item_order;
	int ret = 0;

	pthread_mutex_lock(&pool_set->lock);
	if (pool_set->entries[item_order]) {
		errno = EBUSY;
		ret = -1;
		goto end;
	}
	pool_set->entries[item_order] = pool;
end:
	pthread_mutex_unlock(&pool_set->lock);
	return ret;
}

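/*
 * Allocate len bytes from the smallest pool of the set whose item_len
 * is large enough. If that pool is exhausted (ENOMEM), retry with the
 * next larger pool.
 */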
static
void *__rseq_percpu_pool_set_malloc(struct rseq_percpu_pool_set *pool_set, size_t len, bool zeroed)
{
	int order, min_order = POOL_SET_MIN_ENTRY;
	struct rseq_percpu_pool *pool;
	void *addr;

again:
	pthread_mutex_lock(&pool_set->lock);
	/* First smallest present pool where @len fits. */
	for (order = min_order; order < POOL_SET_NR_ENTRIES; order++) {
		pool = pool_set->entries[order];

		if (!pool)
			continue;
		if (pool->item_len >= len)
			goto found;
	}
	pool = NULL;
found:
	pthread_mutex_unlock(&pool_set->lock);
	if (pool) {
		addr = __rseq_percpu_malloc(pool, zeroed);
		if (addr == NULL && errno == ENOMEM) {
			/*
			 * If the allocation failed, try again with a
			 * larger pool.
			 */
			min_order = order + 1;
			goto again;
		}
	} else {
		/* Not found. */
		errno = ENOMEM;
		addr = NULL;
	}
	return addr;
}

void *rseq_percpu_pool_set_malloc(struct rseq_percpu_pool_set *pool_set, size_t len)
{
	return __rseq_percpu_pool_set_malloc(pool_set, len, false);
}

void *rseq_percpu_pool_set_zmalloc(struct rseq_percpu_pool_set *pool_set, size_t len)
{
	return __rseq_percpu_pool_set_malloc(pool_set, len, true);
}