// SPDX-License-Identifier: MIT
// SPDX-FileCopyrightText: 2024 Mathieu Desnoyers <mathieu.desnoyers@efficios.com>

#include <rseq/percpu-alloc.h>
#include <sys/mman.h>
#include <assert.h>
#include <string.h>
#include <pthread.h>
#include <unistd.h>
#include <stdlib.h>
#include <rseq/compiler.h>
#include <errno.h>
#include <stdint.h>
#include <stdbool.h>
#include <stdio.h>

#ifdef HAVE_LIBNUMA
# include <numa.h>
# include <numaif.h>
#endif

/*
 * rseq-percpu-alloc.c: rseq per-cpu memory allocator.
 *
 * The rseq per-cpu memory allocator allows the application to request
 * memory pools of per-cpu memory, each with a given virtual address
 * size per cpu, for a given maximum number of CPUs.
 */
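
/*
 * Minimal usage sketch (illustrative only; the item type, pool
 * geometry and cpu variable below are hypothetical application-side
 * choices, not part of this allocator):
 *
 *	struct rseq_percpu_pool *pool;
 *	void *p;
 *	intptr_t *cpu_item;
 *
 *	pool = rseq_percpu_pool_create(sizeof(intptr_t),
 *			64 * 4096, 256,
 *			PROT_READ | PROT_WRITE,
 *			MAP_PRIVATE | MAP_ANONYMOUS, -1, 0, 0);
 *	p = rseq_percpu_zmalloc(pool);
 *	cpu_item = (intptr_t *) __rseq_percpu_ptr(p, cpu);
 *	(*cpu_item)++;
 *	rseq_percpu_free(p);
 *	(void) rseq_percpu_pool_destroy(pool);
 */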

/*
 * Use high bits of per-cpu addresses to index the pool.
 * This leaves the low bits available to the application for pointer
 * tagging (based on next power of 2 alignment of the allocations).
 */
#if RSEQ_BITS_PER_LONG == 64
# define POOL_INDEX_BITS 16
#else
# define POOL_INDEX_BITS 8
#endif
#define MAX_NR_POOLS (1UL << POOL_INDEX_BITS)
#define POOL_INDEX_SHIFT (RSEQ_BITS_PER_LONG - POOL_INDEX_BITS)
#define MAX_POOL_LEN (1UL << POOL_INDEX_SHIFT)
#define MAX_POOL_LEN_MASK (MAX_POOL_LEN - 1)
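
/*
 * For example, with RSEQ_BITS_PER_LONG == 64, POOL_INDEX_BITS is 16
 * and POOL_INDEX_SHIFT is 48: an item at offset 0x40 in the pool at
 * index 2 is encoded as (2UL << 48) | 0x40. Decoding masks the low
 * 48 bits with MAX_POOL_LEN_MASK to recover the offset, and shifts
 * right by POOL_INDEX_SHIFT to recover the pool index.
 */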

#define POOL_SET_NR_ENTRIES POOL_INDEX_SHIFT

/*
 * Smallest allocation should hold enough space for a free list pointer.
 */
#if RSEQ_BITS_PER_LONG == 64
# define POOL_SET_MIN_ENTRY 3	/* Smallest item_len=8 */
#else
# define POOL_SET_MIN_ENTRY 2	/* Smallest item_len=4 */
#endif

#define DEFAULT_PAGE_SIZE 4096

struct free_list_node;

struct free_list_node {
	struct free_list_node *next;
};

/* This lock protects pool create/destroy. */
static pthread_mutex_t pool_lock = PTHREAD_MUTEX_INITIALIZER;

struct rseq_percpu_pool {
	void *base;
	unsigned int index;
	size_t item_len;
	size_t percpu_len;
	int item_order;
	int max_nr_cpus;

	/*
	 * The free list chains freed items on the cpu 0 address range.
	 * We should rethink this decision if false sharing between
	 * malloc/free from other cpus and data accesses from cpu 0
	 * becomes an issue. This is a NULL-terminated singly-linked
	 * list.
	 */
	struct free_list_node *free_list_head;
	size_t next_unused;
	/* This lock protects allocation/free within the pool. */
	pthread_mutex_t lock;
};

// TODO: the array of pools should grow dynamically on create.
static struct rseq_percpu_pool rseq_percpu_pool[MAX_NR_POOLS];

/*
 * Pool set entries are indexed by item_len rounded to the next power of
 * 2. A pool set can contain NULL pool entries, in which case the next
 * large enough entry will be used for allocation.
 */
struct rseq_percpu_pool_set {
	/* This lock protects add vs malloc/zmalloc within the pool set. */
	pthread_mutex_t lock;
	struct rseq_percpu_pool *entries[POOL_SET_NR_ENTRIES];
};

#define __rseq_align_mask(v, mask) (((v) + (mask)) & ~(mask))
#define rseq_align(v, align) __rseq_align_mask(v, (__typeof__(v)) (align) - 1)

static __attribute__((unused))
unsigned int fls_u64(uint64_t x)
{
	unsigned int r = 64;

	if (!x)
		return 0;

	if (!(x & 0xFFFFFFFF00000000ULL)) {
		x <<= 32;
		r -= 32;
	}
	if (!(x & 0xFFFF000000000000ULL)) {
		x <<= 16;
		r -= 16;
	}
	if (!(x & 0xFF00000000000000ULL)) {
		x <<= 8;
		r -= 8;
	}
	if (!(x & 0xF000000000000000ULL)) {
		x <<= 4;
		r -= 4;
	}
	if (!(x & 0xC000000000000000ULL)) {
		x <<= 2;
		r -= 2;
	}
	if (!(x & 0x8000000000000000ULL)) {
		x <<= 1;
		r -= 1;
	}
	return r;
}

static __attribute__((unused))
unsigned int fls_u32(uint32_t x)
{
	unsigned int r = 32;

	if (!x)
		return 0;
	if (!(x & 0xFFFF0000U)) {
		x <<= 16;
		r -= 16;
	}
	if (!(x & 0xFF000000U)) {
		x <<= 8;
		r -= 8;
	}
	if (!(x & 0xF0000000U)) {
		x <<= 4;
		r -= 4;
	}
	if (!(x & 0xC0000000U)) {
		x <<= 2;
		r -= 2;
	}
	if (!(x & 0x80000000U)) {
		x <<= 1;
		r -= 1;
	}
	return r;
}
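
/*
 * Both fls helpers return the 1-based position of the most significant
 * set bit, or 0 for an all-zero input: e.g. fls_u32(1) == 1,
 * fls_u32(6) == 3, fls_u32(0x80000000U) == 32.
 */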

static
unsigned int fls_ulong(unsigned long x)
{
#if RSEQ_BITS_PER_LONG == 32
	return fls_u32(x);
#else
	return fls_u64(x);
#endif
}

/*
 * Return the minimum order for which x <= (1UL << order).
 * Return -1 if x is 0.
 */
static
int get_count_order_ulong(unsigned long x)
{
	if (!x)
		return -1;

	return fls_ulong(x - 1);
}
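
/*
 * For instance, get_count_order_ulong(8) == 3 (8 fits exactly in
 * 1UL << 3) and get_count_order_ulong(9) == 4.
 */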

static
long rseq_get_page_len(void)
{
	long page_len = sysconf(_SC_PAGE_SIZE);

	if (page_len < 0)
		page_len = DEFAULT_PAGE_SIZE;
	return page_len;
}

static
void *__rseq_pool_percpu_ptr(struct rseq_percpu_pool *pool, int cpu, uintptr_t item_offset)
{
	return pool->base + (pool->percpu_len * cpu) + item_offset;
}

void *__rseq_percpu_ptr(void *_ptr, int cpu)
{
	uintptr_t ptr = (uintptr_t) _ptr;
	uintptr_t item_offset = ptr & MAX_POOL_LEN_MASK;
	uintptr_t pool_index = ptr >> POOL_INDEX_SHIFT;
	struct rseq_percpu_pool *pool = &rseq_percpu_pool[pool_index];

	assert(cpu >= 0);
	return __rseq_pool_percpu_ptr(pool, cpu, item_offset);
}
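
/*
 * Sketch of a typical access through an encoded per-cpu pointer
 * (assuming the application tracks the current cpu number, e.g. via
 * librseq's rseq_current_cpu_raw(); the item type is an arbitrary
 * example):
 *
 *	void *p = rseq_percpu_malloc(pool);
 *	intptr_t *cpu_data = (intptr_t *) __rseq_percpu_ptr(p, cpu);
 */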

static
void rseq_percpu_zero_item(struct rseq_percpu_pool *pool, uintptr_t item_offset)
{
	int i;

	for (i = 0; i < pool->max_nr_cpus; i++) {
		char *p = __rseq_pool_percpu_ptr(pool, i, item_offset);
		memset(p, 0, pool->item_len);
	}
}

#ifdef HAVE_LIBNUMA
static
void rseq_percpu_pool_init_numa(struct rseq_percpu_pool *pool,
		int numa_flags)
{
	unsigned long nr_pages, page;
	long ret, page_len;
	int cpu;

	if (!numa_flags)
		return;
	page_len = rseq_get_page_len();
	nr_pages = pool->percpu_len >> get_count_order_ulong(page_len);
	for (cpu = 0; cpu < pool->max_nr_cpus; cpu++) {
		int node = numa_node_of_cpu(cpu);

		/* TODO: batch move_pages() call with an array of pages. */
		for (page = 0; page < nr_pages; page++) {
			void *pageptr = __rseq_pool_percpu_ptr(pool, cpu, page * page_len);
			int status = -EPERM;

			ret = move_pages(0, 1, &pageptr, &node, &status, numa_flags);
			if (ret) {
				perror("move_pages");
				abort();
			}
		}
	}
}
#else
static
void rseq_percpu_pool_init_numa(struct rseq_percpu_pool *pool __attribute__((unused)),
		int numa_flags __attribute__((unused)))
{
}
#endif

/*
 * Expected numa_flags:
 * 0: do not move pages to specific numa nodes (use for e.g. mm_cid indexing).
 * MPOL_MF_MOVE: move process-private pages to cpu-specific numa nodes.
 * MPOL_MF_MOVE_ALL: move shared pages to cpu-specific numa nodes (requires CAP_SYS_NICE).
 */
struct rseq_percpu_pool *rseq_percpu_pool_create(size_t item_len,
		size_t percpu_len, int max_nr_cpus,
		int mmap_prot, int mmap_flags, int mmap_fd,
		off_t mmap_offset, int numa_flags)
{
	struct rseq_percpu_pool *pool;
	void *base;
	unsigned int i;
	int order;

	/* Make sure each item is large enough to contain free list pointers. */
	if (item_len < sizeof(void *))
		item_len = sizeof(void *);

	/* Align item_len on next power of two. */
	order = get_count_order_ulong(item_len);
	if (order < 0) {
		errno = EINVAL;
		return NULL;
	}
	item_len = 1UL << order;

	/* Align percpu_len on page size. */
	percpu_len = rseq_align(percpu_len, rseq_get_page_len());

	if (max_nr_cpus < 0 || item_len > percpu_len ||
			percpu_len > (UINTPTR_MAX >> POOL_INDEX_BITS)) {
		errno = EINVAL;
		return NULL;
	}

	pthread_mutex_lock(&pool_lock);
	/* Linear scan in array of pools to find empty spot. */
	for (i = 0; i < MAX_NR_POOLS; i++) {
		pool = &rseq_percpu_pool[i];
		if (!pool->base)
			goto found_empty;
	}
	errno = ENOMEM;
	pool = NULL;
	goto end;

found_empty:
	base = mmap(NULL, percpu_len * max_nr_cpus, mmap_prot,
			mmap_flags, mmap_fd, mmap_offset);
	if (base == MAP_FAILED) {
		pool = NULL;
		goto end;
	}
	pthread_mutex_init(&pool->lock, NULL);
	pool->base = base;
	pool->percpu_len = percpu_len;
	pool->max_nr_cpus = max_nr_cpus;
	pool->index = i;
	pool->item_len = item_len;
	pool->item_order = order;
	/*
	 * Move pages to numa nodes only after the pool layout fields it
	 * reads (base, percpu_len, max_nr_cpus) are populated.
	 */
	rseq_percpu_pool_init_numa(pool, numa_flags);
end:
	pthread_mutex_unlock(&pool_lock);
	return pool;
}
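
/*
 * For instance, a process-private anonymous mapping with no NUMA page
 * placement (argument values other than the mmap arguments are
 * illustrative):
 *
 *	pool = rseq_percpu_pool_create(item_len, percpu_len, max_nr_cpus,
 *			PROT_READ | PROT_WRITE,
 *			MAP_PRIVATE | MAP_ANONYMOUS, -1, 0, 0);
 */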

int rseq_percpu_pool_destroy(struct rseq_percpu_pool *pool)
{
	int ret;

	pthread_mutex_lock(&pool_lock);
	if (!pool->base) {
		errno = ENOENT;
		ret = -1;
		goto end;
	}
	ret = munmap(pool->base, pool->percpu_len * pool->max_nr_cpus);
	if (ret)
		goto end;
	pthread_mutex_destroy(&pool->lock);
	memset(pool, 0, sizeof(*pool));
end:
	pthread_mutex_unlock(&pool_lock);
	return ret;
}

static
void *__rseq_percpu_malloc(struct rseq_percpu_pool *pool, bool zeroed)
{
	struct free_list_node *node;
	uintptr_t item_offset;
	void *addr;

	pthread_mutex_lock(&pool->lock);
	/* Get first entry from free list. */
	node = pool->free_list_head;
	if (node != NULL) {
		/* Remove node from free list (update head). */
		pool->free_list_head = node->next;
		item_offset = (uintptr_t) ((void *) node - pool->base);
		addr = (void *) (((uintptr_t) pool->index << POOL_INDEX_SHIFT) | item_offset);
		goto end;
	}
	if (pool->next_unused + pool->item_len > pool->percpu_len) {
		/*
		 * Pool exhausted. Set errno so pool set allocation can
		 * fall back to a larger pool.
		 */
		errno = ENOMEM;
		addr = NULL;
		goto end;
	}
	item_offset = pool->next_unused;
	addr = (void *) (((uintptr_t) pool->index << POOL_INDEX_SHIFT) | item_offset);
	pool->next_unused += pool->item_len;
end:
	pthread_mutex_unlock(&pool->lock);
	if (zeroed && addr)
		rseq_percpu_zero_item(pool, item_offset);
	return addr;
}

void *rseq_percpu_malloc(struct rseq_percpu_pool *pool)
{
	return __rseq_percpu_malloc(pool, false);
}

void *rseq_percpu_zmalloc(struct rseq_percpu_pool *pool)
{
	return __rseq_percpu_malloc(pool, true);
}

void rseq_percpu_free(void *_ptr)
{
	uintptr_t ptr = (uintptr_t) _ptr;
	uintptr_t item_offset = ptr & MAX_POOL_LEN_MASK;
	uintptr_t pool_index = ptr >> POOL_INDEX_SHIFT;
	struct rseq_percpu_pool *pool = &rseq_percpu_pool[pool_index];
	struct free_list_node *head, *item;

	pthread_mutex_lock(&pool->lock);
	/* Add ptr to head of free list. */
	head = pool->free_list_head;
	/* Free-list is in cpu 0 range. */
	item = (struct free_list_node *) __rseq_pool_percpu_ptr(pool, 0, item_offset);
	item->next = head;
	pool->free_list_head = item;
	pthread_mutex_unlock(&pool->lock);
}

struct rseq_percpu_pool_set *rseq_percpu_pool_set_create(void)
{
	struct rseq_percpu_pool_set *pool_set;

	pool_set = calloc(1, sizeof(struct rseq_percpu_pool_set));
	if (!pool_set)
		return NULL;
	pthread_mutex_init(&pool_set->lock, NULL);
	return pool_set;
}

int rseq_percpu_pool_set_destroy(struct rseq_percpu_pool_set *pool_set)
{
	int order, ret;

	for (order = POOL_SET_MIN_ENTRY; order < POOL_SET_NR_ENTRIES; order++) {
		struct rseq_percpu_pool *pool = pool_set->entries[order];

		if (!pool)
			continue;
		ret = rseq_percpu_pool_destroy(pool);
		if (ret)
			return ret;
		pool_set->entries[order] = NULL;
	}
	pthread_mutex_destroy(&pool_set->lock);
	free(pool_set);
	return 0;
}

/* Ownership of pool is handed over to pool set on success. */
int rseq_percpu_pool_set_add_pool(struct rseq_percpu_pool_set *pool_set, struct rseq_percpu_pool *pool)
{
	size_t item_order = pool->item_order;
	int ret = 0;

	pthread_mutex_lock(&pool_set->lock);
	if (pool_set->entries[item_order]) {
		errno = EBUSY;
		ret = -1;
		goto end;
	}
	pool_set->entries[item_order] = pool;
end:
	pthread_mutex_unlock(&pool_set->lock);
	return ret;
}

static
void *__rseq_percpu_pool_set_malloc(struct rseq_percpu_pool_set *pool_set, size_t len, bool zeroed)
{
	int order, min_order = POOL_SET_MIN_ENTRY;
	struct rseq_percpu_pool *pool;
	void *addr;

again:
	pthread_mutex_lock(&pool_set->lock);
	/* First smallest present pool where @len fits. */
	for (order = min_order; order < POOL_SET_NR_ENTRIES; order++) {
		pool = pool_set->entries[order];

		if (!pool)
			continue;
		if (pool->item_len >= len)
			goto found;
	}
	pool = NULL;
found:
	pthread_mutex_unlock(&pool_set->lock);
	if (pool) {
		addr = __rseq_percpu_malloc(pool, zeroed);
		if (addr == NULL && errno == ENOMEM) {
			/*
			 * If the allocation failed, try again with a
			 * larger pool.
			 */
			min_order = order + 1;
			goto again;
		}
	} else {
		/* Not found. */
		errno = ENOMEM;
		addr = NULL;
	}
	return addr;
}

void *rseq_percpu_pool_set_malloc(struct rseq_percpu_pool_set *pool_set, size_t len)
{
	return __rseq_percpu_pool_set_malloc(pool_set, len, false);
}

void *rseq_percpu_pool_set_zmalloc(struct rseq_percpu_pool_set *pool_set, size_t len)
{
	return __rseq_percpu_pool_set_malloc(pool_set, len, true);
}
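
/*
 * Pool set usage sketch (pool sizes are illustrative): register pools
 * of increasing item_len once, then let the set pick the smallest
 * fitting pool for each allocation.
 *
 *	struct rseq_percpu_pool_set *set = rseq_percpu_pool_set_create();
 *
 *	... create pools with item_len 8, 16 and 32 bytes, then hand
 *	each one over with rseq_percpu_pool_set_add_pool(set, pool) ...
 *
 *	void *p = rseq_percpu_pool_set_zmalloc(set, 24);
 *		(allocated from the 32-byte pool)
 *	rseq_percpu_free(p);
 */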