// SPDX-License-Identifier: MIT
// SPDX-FileCopyrightText: 2024 Mathieu Desnoyers <mathieu.desnoyers@efficios.com>

#include <rseq/percpu-alloc.h>
#include <sys/mman.h>
#include <errno.h>
#include <pthread.h>
#include <unistd.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <stdio.h>
#include <rseq/compiler.h>

#ifdef HAVE_LIBNUMA
# include <numa.h>
# include <numaif.h>
#endif

/*
 * rseq-percpu-alloc.c: rseq per-cpu memory allocator.
 *
 * The rseq per-cpu memory allocator allows the application to request
 * memory pools of per-cpu memory, each of a given virtual address size
 * per-cpu, for a given maximum number of CPUs.
 */
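/*
 * Minimal usage sketch (illustrative comment only, error handling
 * elided). "struct counter", "nr_cpus" and the pool sizes are example
 * values, not part of the allocator API; the current CPU number is
 * assumed to come from e.g. rseq_current_cpu_raw() in <rseq/rseq.h>:
 *
 *	struct counter { intptr_t count; };
 *	int nr_cpus = sysconf(_SC_NPROCESSORS_CONF);
 *	struct rseq_percpu_pool *pool;
 *	void *p;
 *
 *	pool = rseq_percpu_pool_create(sizeof(struct counter),
 *			256 * 1024, nr_cpus,
 *			PROT_READ | PROT_WRITE,
 *			MAP_ANONYMOUS | MAP_PRIVATE, -1, 0, 0);
 *	p = rseq_percpu_zmalloc(pool);
 *	((struct counter *) __rseq_percpu_ptr(p, cpu))->count++;
 *	rseq_percpu_free(p);
 *	rseq_percpu_pool_destroy(pool);
 */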
/* Use lowest bits of per-cpu addresses to index the pool. */
#if RSEQ_BITS_PER_LONG == 64
# define OFFSET_SHIFT	16
#else
# define OFFSET_SHIFT	8
#endif
#define MAX_NR_POOLS	(1U << OFFSET_SHIFT)
#define POOL_MASK	(MAX_NR_POOLS - 1)

#define POOL_SET_NR_ENTRIES	(RSEQ_BITS_PER_LONG - OFFSET_SHIFT)
#define POOL_SET_MAX_ALLOC_LEN	(1UL << POOL_SET_NR_ENTRIES)

#if RSEQ_BITS_PER_LONG == 64
# define POOL_SET_MIN_ENTRY	3	/* Smallest item_len=8 */
#else
# define POOL_SET_MIN_ENTRY	2	/* Smallest item_len=4 */
#endif

#define DEFAULT_PAGE_SIZE	4096
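/*
 * Resulting encoding, for example on 64-bit (OFFSET_SHIFT == 16): an
 * allocated handle for item offset 0x40 in pool index 2 is
 * (0x40 << 16) | 2 == 0x400002. __rseq_percpu_ptr() below recovers the
 * item offset with "handle >> OFFSET_SHIFT" and the pool with
 * "handle & POOL_MASK".
 */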
struct free_list_node;

struct free_list_node {
	struct free_list_node *next;
};

/* This lock protects pool create/destroy. */
static pthread_mutex_t pool_lock = PTHREAD_MUTEX_INITIALIZER;
struct rseq_percpu_pool {
	/* Start of the mmap'd area (percpu_len * max_nr_cpus bytes). */
	void *base;
	/* Index of this pool in the static pool array. */
	unsigned int index;
	size_t item_len;
	size_t percpu_len;
	int item_order;
	int max_nr_cpus;

	/*
	 * The free list chains freed items on the cpu 0 address range.
	 * We should rethink this decision if false sharing between
	 * malloc/free from other cpus and data accesses from cpu 0
	 * becomes an issue. This is a NULL-terminated singly-linked
	 * list.
	 */
	struct free_list_node *free_list_head;
	/* Offset of the next never-allocated item within a cpu range. */
	size_t next_unused;
	/* This lock protects allocation/free within the pool. */
	pthread_mutex_t lock;
};

//TODO: the array of pools should grow dynamically on create.
static struct rseq_percpu_pool rseq_percpu_pool[MAX_NR_POOLS];
/*
 * Pool set entries are indexed by item_len rounded to the next power of
 * 2. A pool set can contain NULL pool entries, in which case the next
 * large enough entry will be used for allocation.
 */
struct rseq_percpu_pool_set {
	/* This lock protects add vs malloc/zmalloc within the pool set. */
	pthread_mutex_t lock;
	struct rseq_percpu_pool *entries[POOL_SET_NR_ENTRIES];
};
#define __rseq_align_mask(v, mask)	(((v) + (mask)) & ~(mask))
#define rseq_align(v, align)		__rseq_align_mask(v, (__typeof__(v)) (align) - 1)
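/*
 * rseq_align() rounds a value up to the next multiple of a power-of-two
 * alignment, e.g. rseq_align(5000, 4096) == 8192.
 */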
static __attribute__((unused))
unsigned int fls_u64(uint64_t x)
{
	unsigned int r = 64;

	if (!x)
		return 0;

	if (!(x & 0xFFFFFFFF00000000ULL)) {
		x <<= 32;
		r -= 32;
	}
	if (!(x & 0xFFFF000000000000ULL)) {
		x <<= 16;
		r -= 16;
	}
	if (!(x & 0xFF00000000000000ULL)) {
		x <<= 8;
		r -= 8;
	}
	if (!(x & 0xF000000000000000ULL)) {
		x <<= 4;
		r -= 4;
	}
	if (!(x & 0xC000000000000000ULL)) {
		x <<= 2;
		r -= 2;
	}
	if (!(x & 0x8000000000000000ULL)) {
		x <<= 1;
		r -= 1;
	}
	return r;
}
static __attribute__((unused))
unsigned int fls_u32(uint32_t x)
{
	unsigned int r = 32;

	if (!x)
		return 0;

	if (!(x & 0xFFFF0000U)) {
		x <<= 16;
		r -= 16;
	}
	if (!(x & 0xFF000000U)) {
		x <<= 8;
		r -= 8;
	}
	if (!(x & 0xF0000000U)) {
		x <<= 4;
		r -= 4;
	}
	if (!(x & 0xC0000000U)) {
		x <<= 2;
		r -= 2;
	}
	if (!(x & 0x80000000U)) {
		x <<= 1;
		r -= 1;
	}
	return r;
}
static
unsigned int fls_ulong(unsigned long x)
{
#if RSEQ_BITS_PER_LONG == 32
	return fls_u32(x);
#else
	return fls_u64(x);
#endif
}

/*
 * Return the minimum order for which x <= (1UL << order).
 * Return -1 if x is 0.
 */
static
int get_count_order_ulong(unsigned long x)
{
	if (!x)
		return -1;
	return fls_ulong(x - 1);
}
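/*
 * For example, get_count_order_ulong(24) == 5 and
 * get_count_order_ulong(32) == 5, since both fit within (1UL << 5).
 */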
static
long rseq_get_page_len(void)
{
	long page_len = sysconf(_SC_PAGE_SIZE);

	if (page_len < 0)
		page_len = DEFAULT_PAGE_SIZE;
	return page_len;
}
static
void *__rseq_pool_percpu_ptr(struct rseq_percpu_pool *pool, int cpu, uintptr_t item_offset)
{
	return pool->base + (pool->percpu_len * cpu) + item_offset;
}

void *__rseq_percpu_ptr(void *_ptr, int cpu)
{
	uintptr_t ptr = (uintptr_t) _ptr;
	uintptr_t item_offset = ptr >> OFFSET_SHIFT;
	uintptr_t pool_index = ptr & POOL_MASK;
	struct rseq_percpu_pool *pool = &rseq_percpu_pool[pool_index];

	return __rseq_pool_percpu_ptr(pool, cpu, item_offset);
}
static
void rseq_percpu_zero_item(struct rseq_percpu_pool *pool, uintptr_t item_offset)
{
	int i;

	for (i = 0; i < pool->max_nr_cpus; i++) {
		char *p = __rseq_pool_percpu_ptr(pool, i, item_offset);
		memset(p, 0, pool->item_len);
	}
}
#ifdef HAVE_LIBNUMA
static
void rseq_percpu_pool_init_numa(struct rseq_percpu_pool *pool,
		int numa_flags)
{
	unsigned long nr_pages, page;
	long ret, page_len;
	int cpu;

	if (!numa_flags)
		return;
	page_len = rseq_get_page_len();
	nr_pages = pool->percpu_len >> get_count_order_ulong(page_len);
	for (cpu = 0; cpu < pool->max_nr_cpus; cpu++) {
		int node = numa_node_of_cpu(cpu);

		/* TODO: batch move_pages() call with an array of pages. */
		for (page = 0; page < nr_pages; page++) {
			void *pageptr = __rseq_pool_percpu_ptr(pool, cpu, page * page_len);
			int status = -1;

			ret = move_pages(0, 1, &pageptr, &node, &status, numa_flags);
			if (ret)
				perror("move_pages");
		}
	}
}
#else
static
void rseq_percpu_pool_init_numa(struct rseq_percpu_pool *pool __attribute__((unused)),
		int numa_flags __attribute__((unused)))
{
}
#endif
/*
 * Expected numa_flags:
 * 0: do not move pages to specific numa nodes (use for e.g. mm_cid indexing).
 * MPOL_MF_MOVE: move process-private pages to cpu-specific numa nodes.
 * MPOL_MF_MOVE_ALL: move shared pages to cpu-specific numa nodes (requires CAP_SYS_NICE).
 */
struct rseq_percpu_pool *rseq_percpu_pool_create(size_t item_len,
		size_t percpu_len, int max_nr_cpus,
		int mmap_prot, int mmap_flags, int mmap_fd,
		off_t mmap_offset, int numa_flags)
{
	struct rseq_percpu_pool *pool;
	void *base;
	unsigned int i;
	int order;

	/* Make sure each item is large enough to contain free list pointers. */
	if (item_len < sizeof(void *))
		item_len = sizeof(void *);

	/* Align item_len on next power of two. */
	order = get_count_order_ulong(item_len);
	item_len = 1UL << order;

	/* Align percpu_len on page size. */
	percpu_len = rseq_align(percpu_len, rseq_get_page_len());

	if (max_nr_cpus < 0 || item_len > percpu_len ||
			percpu_len > (UINTPTR_MAX >> OFFSET_SHIFT)) {
		errno = EINVAL;
		return NULL;
	}

	pthread_mutex_lock(&pool_lock);
	/* Linear scan in array of pools to find empty spot. */
	for (i = 0; i < MAX_NR_POOLS; i++) {
		pool = &rseq_percpu_pool[i];
		if (!pool->base)
			goto found_empty;
	}
	errno = ENOMEM;
	pool = NULL;
	goto end;

found_empty:
	base = mmap(NULL, percpu_len * max_nr_cpus, mmap_prot,
			mmap_flags, mmap_fd, mmap_offset);
	if (base == MAP_FAILED) {
		pool = NULL;
		goto end;
	}
	pthread_mutex_init(&pool->lock, NULL);
	pool->base = base;
	pool->percpu_len = percpu_len;
	pool->max_nr_cpus = max_nr_cpus;
	pool->index = i;
	pool->item_len = item_len;
	pool->item_order = order;
	/*
	 * NUMA initialization reads the pool layout fields, so perform it
	 * only after they have been set.
	 */
	rseq_percpu_pool_init_numa(pool, numa_flags);
end:
	pthread_mutex_unlock(&pool_lock);
	return pool;
}
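/*
 * For a NUMA-aware pool, the same call can pass MPOL_MF_MOVE (from
 * <numaif.h>) as numa_flags so that each CPU's range is migrated to
 * that CPU's NUMA node. Illustrative sketch reusing the example values
 * from the usage comment at the top of this file:
 *
 *	pool = rseq_percpu_pool_create(sizeof(struct counter),
 *			256 * 1024, nr_cpus,
 *			PROT_READ | PROT_WRITE,
 *			MAP_ANONYMOUS | MAP_PRIVATE, -1, 0, MPOL_MF_MOVE);
 */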
int rseq_percpu_pool_destroy(struct rseq_percpu_pool *pool)
{
	int ret;

	pthread_mutex_lock(&pool_lock);
	if (!pool->base) {
		errno = ENOENT;
		ret = -1;
		goto end;
	}
	ret = munmap(pool->base, pool->percpu_len * pool->max_nr_cpus);
	if (ret)
		goto end;
	pthread_mutex_destroy(&pool->lock);
	memset(pool, 0, sizeof(*pool));
end:
	pthread_mutex_unlock(&pool_lock);
	return ret;
}
static
void *__rseq_percpu_malloc(struct rseq_percpu_pool *pool, bool zeroed)
{
	struct free_list_node *node;
	uintptr_t item_offset;
	void *addr;

	pthread_mutex_lock(&pool->lock);
	/* Get first entry from free list. */
	node = pool->free_list_head;
	if (node != NULL) {
		/* Remove node from free list (update head). */
		pool->free_list_head = node->next;
		item_offset = (uintptr_t) ((void *) node - pool->base);
		addr = (void *) ((item_offset << OFFSET_SHIFT) | pool->index);
		goto end;
	}
	if (pool->next_unused + pool->item_len > pool->percpu_len) {
		errno = ENOMEM;
		addr = NULL;
		goto end;
	}
	item_offset = pool->next_unused;
	addr = (void *) ((item_offset << OFFSET_SHIFT) | pool->index);
	pool->next_unused += pool->item_len;
end:
	pthread_mutex_unlock(&pool->lock);
	if (zeroed && addr)
		rseq_percpu_zero_item(pool, item_offset);
	return addr;
}

void *rseq_percpu_malloc(struct rseq_percpu_pool *pool)
{
	return __rseq_percpu_malloc(pool, false);
}

void *rseq_percpu_zmalloc(struct rseq_percpu_pool *pool)
{
	return __rseq_percpu_malloc(pool, true);
}
void rseq_percpu_free(void *_ptr)
{
	uintptr_t ptr = (uintptr_t) _ptr;
	uintptr_t item_offset = ptr >> OFFSET_SHIFT;
	uintptr_t pool_index = ptr & POOL_MASK;
	struct rseq_percpu_pool *pool = &rseq_percpu_pool[pool_index];
	struct free_list_node *head, *item;

	pthread_mutex_lock(&pool->lock);
	/* Add ptr to head of free list */
	head = pool->free_list_head;
	/* Free-list is in cpu 0 range. */
	item = (struct free_list_node *) __rseq_pool_percpu_ptr(pool, 0, item_offset);
	item->next = head;
	pool->free_list_head = item;
	pthread_mutex_unlock(&pool->lock);
}
struct rseq_percpu_pool_set *rseq_percpu_pool_set_create(void)
{
	struct rseq_percpu_pool_set *pool_set;

	pool_set = calloc(1, sizeof(struct rseq_percpu_pool_set));
	if (!pool_set)
		return NULL;
	pthread_mutex_init(&pool_set->lock, NULL);
	return pool_set;
}
int rseq_percpu_pool_set_destroy(struct rseq_percpu_pool_set *pool_set)
{
	int order, ret;

	for (order = POOL_SET_MIN_ENTRY; order < POOL_SET_NR_ENTRIES; order++) {
		struct rseq_percpu_pool *pool = pool_set->entries[order];

		if (!pool)
			continue;
		ret = rseq_percpu_pool_destroy(pool);
		if (ret)
			return ret;
		pool_set->entries[order] = NULL;
	}
	pthread_mutex_destroy(&pool_set->lock);
	free(pool_set);
	return 0;
}
/* Ownership of pool is handed over to pool set on success. */
int rseq_percpu_pool_set_add_pool(struct rseq_percpu_pool_set *pool_set, struct rseq_percpu_pool *pool)
{
	size_t item_order = pool->item_order;
	int ret = 0;

	pthread_mutex_lock(&pool_set->lock);
	if (pool_set->entries[item_order]) {
		errno = EBUSY;
		ret = -1;
		goto end;
	}
	pool_set->entries[pool->item_order] = pool;
end:
	pthread_mutex_unlock(&pool_set->lock);
	return ret;
}
static
void *__rseq_percpu_pool_set_malloc(struct rseq_percpu_pool_set *pool_set, size_t len, bool zeroed)
{
	int order, min_order = POOL_SET_MIN_ENTRY;
	struct rseq_percpu_pool *pool;
	void *addr;

again:
	pthread_mutex_lock(&pool_set->lock);
	/* First smallest present pool where @len fits. */
	for (order = min_order; order < POOL_SET_NR_ENTRIES; order++) {
		pool = pool_set->entries[order];

		if (!pool)
			continue;
		if (pool->item_len >= len)
			goto found;
	}
	pool = NULL;
found:
	pthread_mutex_unlock(&pool_set->lock);
	if (pool) {
		addr = __rseq_percpu_malloc(pool, zeroed);
		if (addr == NULL && errno == ENOMEM) {
			/*
			 * If the allocation failed, try again with a
			 * larger pool from the set.
			 */
			min_order = order + 1;
			goto again;
		}
	} else {
		/* No pool in the set can satisfy this allocation. */
		errno = ENOMEM;
		addr = NULL;
	}
	return addr;
}
void *rseq_percpu_pool_set_malloc(struct rseq_percpu_pool_set *pool_set, size_t len)
{
	return __rseq_percpu_pool_set_malloc(pool_set, len, false);
}

void *rseq_percpu_pool_set_zmalloc(struct rseq_percpu_pool_set *pool_set, size_t len)
{
	return __rseq_percpu_pool_set_malloc(pool_set, len, true);
}
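/*
 * Pool set usage sketch (illustrative comment only, error handling
 * elided): a set groups pools of different item sizes, and allocations
 * are served from the smallest pool whose item_len fits the request.
 * "nr_cpus" and the sizes are the example values from the usage sketch
 * at the top of this file:
 *
 *	struct rseq_percpu_pool_set *set = rseq_percpu_pool_set_create();
 *
 *	rseq_percpu_pool_set_add_pool(set,
 *		rseq_percpu_pool_create(8, 256 * 1024, nr_cpus,
 *			PROT_READ | PROT_WRITE,
 *			MAP_ANONYMOUS | MAP_PRIVATE, -1, 0, 0));
 *	rseq_percpu_pool_set_add_pool(set,
 *		rseq_percpu_pool_create(64, 256 * 1024, nr_cpus,
 *			PROT_READ | PROT_WRITE,
 *			MAP_ANONYMOUS | MAP_PRIVATE, -1, 0, 0));
 *
 *	// A 24-byte request is served by the 64-byte item pool.
 *	void *p = rseq_percpu_pool_set_zmalloc(set, 24);
 *	rseq_percpu_free(p);
 *	rseq_percpu_pool_set_destroy(set);
 */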