// SPDX-License-Identifier: MIT
// SPDX-FileCopyrightText: 2024 Mathieu Desnoyers <mathieu.desnoyers@efficios.com>

#include <rseq/percpu-alloc.h>
#include <sys/mman.h>
#include <errno.h>
#include <pthread.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <rseq/compiler.h>

/*
 * rseq-percpu-alloc.c: rseq per-cpu memory allocator.
 *
 * The rseq per-cpu memory allocator allows the application to request
 * pools of per-cpu memory, each reserving a given amount of virtual
 * address space per CPU, for a given maximum number of CPUs.
 */
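/*
 * Illustrative usage sketch (not compiled with this file; the item
 * type, pool size, CPU count and mmap flags below are arbitrary
 * example values):
 *
 *	struct counter { intptr_t count; };
 *	struct rseq_percpu_pool *pool;
 *	void *handle;
 *	int cpu = 0;	// any valid cpu index below max_nr_cpus
 *
 *	pool = rseq_percpu_pool_create(sizeof(struct counter),
 *			16 * 4096, 256,
 *			PROT_READ | PROT_WRITE,
 *			MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *	handle = rseq_percpu_zmalloc(pool);
 *
 *	// The returned handle encodes a pool index and item offset; it
 *	// must be translated with __rseq_percpu_ptr() before access.
 *	((struct counter *) __rseq_percpu_ptr(handle, cpu))->count++;
 *
 *	rseq_percpu_free(handle);
 *	rseq_percpu_pool_destroy(pool);
 */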
/* Use lowest bits of per-cpu addresses to index the pool. */
#if RSEQ_BITS_PER_LONG == 64
# define OFFSET_SHIFT	16
#else
# define OFFSET_SHIFT	8
#endif
#define MAX_NR_POOLS	(1U << OFFSET_SHIFT)
#define POOL_MASK	(MAX_NR_POOLS - 1)
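/*
 * Sketch of the resulting encoding, assuming OFFSET_SHIFT == 16: an
 * item at offset 0x40 within the pool of index 3 is handed out to the
 * application as (0x40 << 16) | 3 == 0x400003. __rseq_percpu_ptr()
 * reverses this by shifting right by OFFSET_SHIFT to recover the item
 * offset and masking with POOL_MASK to recover the pool index.
 */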
#define POOL_SET_NR_ENTRIES	(RSEQ_BITS_PER_LONG - OFFSET_SHIFT)
#define POOL_SET_MAX_ALLOC_LEN	(1UL << POOL_SET_NR_ENTRIES)
#if RSEQ_BITS_PER_LONG == 64
# define POOL_SET_MIN_ENTRY	3	/* Smallest item_len=8 */
#else
# define POOL_SET_MIN_ENTRY	2	/* Smallest item_len=4 */
#endif

#define DEFAULT_PAGE_SIZE	4096
struct free_list_node;

struct free_list_node {
	struct free_list_node *next;
};

/* This lock protects pool create/destroy. */
static pthread_mutex_t pool_lock = PTHREAD_MUTEX_INITIALIZER;
struct rseq_percpu_pool {
	void *base;
	unsigned int index;
	size_t item_len;
	size_t percpu_len;
	int item_order;
	int max_nr_cpus;

	/*
	 * The free list chains freed items on the cpu 0 address range.
	 * We should rethink this decision if false sharing between
	 * malloc/free from other cpus and data accesses from cpu 0
	 * becomes an issue. This is a NULL-terminated singly-linked
	 * list.
	 */
	struct free_list_node *free_list_head;
	size_t next_unused;
	/* This lock protects allocation/free within the pool. */
	pthread_mutex_t lock;
};

//TODO: the array of pools should grow dynamically on create.
static struct rseq_percpu_pool rseq_percpu_pool[MAX_NR_POOLS];
/*
 * Pool set entries are indexed by item_len rounded to the next power of
 * 2. A pool set can contain NULL pool entries, in which case the next
 * large enough entry will be used for allocation.
 */
struct rseq_percpu_pool_set {
	/* This lock protects add vs malloc/zmalloc within the pool set. */
	pthread_mutex_t lock;
	struct rseq_percpu_pool *entries[POOL_SET_NR_ENTRIES];
};
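/*
 * Indexing sketch: a pool created with item_len 24 gets item_len
 * rounded up to 32 (item_order 5), so rseq_percpu_pool_set_add_pool()
 * stores it in entries[5]. A later rseq_percpu_pool_set_malloc() of
 * any len <= 32 that finds entries[3] and entries[4] NULL falls
 * through to that same entry.
 */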
#define __rseq_align_mask(v, mask)	(((v) + (mask)) & ~(mask))
#define rseq_align(v, align)		__rseq_align_mask(v, (__typeof__(v)) (align) - 1)
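/*
 * For example, rseq_align(5000, 4096) yields 8192: the value is
 * rounded up to the next multiple of the (power of two) alignment.
 */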
static __attribute__((unused))
unsigned int fls_u64(uint64_t x)
{
	unsigned int r = 64;

	if (!x)
		return 0;

	if (!(x & 0xFFFFFFFF00000000ULL)) {
		x <<= 32;
		r -= 32;
	}
	if (!(x & 0xFFFF000000000000ULL)) {
		x <<= 16;
		r -= 16;
	}
	if (!(x & 0xFF00000000000000ULL)) {
		x <<= 8;
		r -= 8;
	}
	if (!(x & 0xF000000000000000ULL)) {
		x <<= 4;
		r -= 4;
	}
	if (!(x & 0xC000000000000000ULL)) {
		x <<= 2;
		r -= 2;
	}
	if (!(x & 0x8000000000000000ULL)) {
		x <<= 1;
		r -= 1;
	}
	return r;
}
static __attribute__((unused))
unsigned int fls_u32(uint32_t x)
{
	unsigned int r = 32;

	if (!x)
		return 0;

	if (!(x & 0xFFFF0000U)) {
		x <<= 16;
		r -= 16;
	}
	if (!(x & 0xFF000000U)) {
		x <<= 8;
		r -= 8;
	}
	if (!(x & 0xF0000000U)) {
		x <<= 4;
		r -= 4;
	}
	if (!(x & 0xC0000000U)) {
		x <<= 2;
		r -= 2;
	}
	if (!(x & 0x80000000U)) {
		x <<= 1;
		r -= 1;
	}
	return r;
}
static
unsigned int fls_ulong(unsigned long x)
{
#if RSEQ_BITS_PER_LONG == 32
	return fls_u32(x);
#else
	return fls_u64(x);
#endif
}
/*
 * Return the minimum order for which x <= (1UL << order).
 * Return -1 if x is 0.
 */
static
int get_count_order_ulong(unsigned long x)
{
	if (!x)
		return -1;

	return fls_ulong(x - 1);
}
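/*
 * For example, get_count_order_ulong(24) returns 5, since 24 fits in
 * 1UL << 5 == 32 but not in 1UL << 4 == 16; an exact power of two such
 * as 32 returns its own order (5).
 */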
struct rseq_percpu_pool *rseq_percpu_pool_create(size_t item_len,
		size_t percpu_len, int max_nr_cpus,
		int prot, int flags, int fd, off_t offset)
{
	struct rseq_percpu_pool *pool;
	void *base;
	unsigned int i;
	long page_len;
	int order;

	/* Make sure each item is large enough to contain free list pointers. */
	if (item_len < sizeof(void *))
		item_len = sizeof(void *);

	/* Align item_len on next power of two. */
	order = get_count_order_ulong(item_len);
	if (order < 0) {
		errno = EINVAL;
		return NULL;
	}
	item_len = 1UL << order;

	/* Align percpu_len on page size. */
	page_len = sysconf(_SC_PAGE_SIZE);
	if (page_len < 0)
		page_len = DEFAULT_PAGE_SIZE;
	percpu_len = rseq_align(percpu_len, page_len);

	if (max_nr_cpus < 0 || item_len > percpu_len ||
			percpu_len > (UINTPTR_MAX >> OFFSET_SHIFT)) {
		errno = EINVAL;
		return NULL;
	}

	pthread_mutex_lock(&pool_lock);
	/* Linear scan in array of pools to find empty spot. */
	for (i = 0; i < MAX_NR_POOLS; i++) {
		pool = &rseq_percpu_pool[i];
		if (!pool->base)
			goto found_empty;
	}
	errno = ENOMEM;
	pool = NULL;
	goto end;

found_empty:
	base = mmap(NULL, percpu_len * max_nr_cpus, prot, flags, fd, offset);
	if (base == MAP_FAILED) {
		pool = NULL;
		goto end;
	}
	// TODO: integrate with libnuma to provide NUMA placement hints.
	// See move_pages(2).
	pthread_mutex_init(&pool->lock, NULL);
	pool->base = base;
	pool->percpu_len = percpu_len;
	pool->max_nr_cpus = max_nr_cpus;
	pool->index = i;
	pool->item_len = item_len;
	pool->item_order = order;
end:
	pthread_mutex_unlock(&pool_lock);
	return pool;
}
int rseq_percpu_pool_destroy(struct rseq_percpu_pool *pool)
{
	int ret;

	pthread_mutex_lock(&pool_lock);
	if (!pool->base) {
		errno = ENOENT;
		ret = -1;
		goto end;
	}
	ret = munmap(pool->base, pool->percpu_len * pool->max_nr_cpus);
	if (ret)
		goto end;
	pthread_mutex_destroy(&pool->lock);
	memset(pool, 0, sizeof(*pool));
end:
	pthread_mutex_unlock(&pool_lock);
	return ret;
}
static
void *__rseq_pool_percpu_ptr(struct rseq_percpu_pool *pool, int cpu, uintptr_t item_offset)
{
	return (char *) pool->base + (pool->percpu_len * cpu) + item_offset;
}
void *__rseq_percpu_ptr(void *_ptr, int cpu)
{
	uintptr_t ptr = (uintptr_t) _ptr;
	uintptr_t item_offset = ptr >> OFFSET_SHIFT;
	uintptr_t pool_index = ptr & POOL_MASK;
	struct rseq_percpu_pool *pool = &rseq_percpu_pool[pool_index];

	return __rseq_pool_percpu_ptr(pool, cpu, item_offset);
}
static
void rseq_percpu_zero_item(struct rseq_percpu_pool *pool, uintptr_t item_offset)
{
	int i;

	for (i = 0; i < pool->max_nr_cpus; i++) {
		char *p = __rseq_pool_percpu_ptr(pool, i, item_offset);

		memset(p, 0, pool->item_len);
	}
}
static
void *__rseq_percpu_malloc(struct rseq_percpu_pool *pool, bool zeroed)
{
	struct free_list_node *node;
	uintptr_t item_offset;
	void *addr;

	pthread_mutex_lock(&pool->lock);
	/* Get first entry from free list. */
	node = pool->free_list_head;
	if (node != NULL) {
		/* Remove node from free list (update head). */
		pool->free_list_head = node->next;
		item_offset = (uintptr_t) ((char *) node - (char *) pool->base);
		addr = (void *) ((item_offset << OFFSET_SHIFT) | pool->index);
		goto end;
	}
	if (pool->next_unused + pool->item_len > pool->percpu_len) {
		errno = ENOMEM;
		addr = NULL;
		goto end;
	}
	item_offset = pool->next_unused;
	addr = (void *) ((item_offset << OFFSET_SHIFT) | pool->index);
	pool->next_unused += pool->item_len;
end:
	pthread_mutex_unlock(&pool->lock);
	if (zeroed && addr)
		rseq_percpu_zero_item(pool, item_offset);
	return addr;
}
void *rseq_percpu_malloc(struct rseq_percpu_pool *pool)
{
	return __rseq_percpu_malloc(pool, false);
}

void *rseq_percpu_zmalloc(struct rseq_percpu_pool *pool)
{
	return __rseq_percpu_malloc(pool, true);
}
void rseq_percpu_free(void *_ptr)
{
	uintptr_t ptr = (uintptr_t) _ptr;
	uintptr_t item_offset = ptr >> OFFSET_SHIFT;
	uintptr_t pool_index = ptr & POOL_MASK;
	struct rseq_percpu_pool *pool = &rseq_percpu_pool[pool_index];
	struct free_list_node *head, *item;

	pthread_mutex_lock(&pool->lock);
	/* Add ptr to head of free list. */
	head = pool->free_list_head;
	/* Free-list is in cpu 0 range. */
	item = (struct free_list_node *) __rseq_pool_percpu_ptr(pool, 0, item_offset);
	item->next = head;
	pool->free_list_head = item;
	pthread_mutex_unlock(&pool->lock);
}
struct rseq_percpu_pool_set *rseq_percpu_pool_set_create(void)
{
	struct rseq_percpu_pool_set *pool_set;

	pool_set = calloc(1, sizeof(struct rseq_percpu_pool_set));
	if (!pool_set)
		return NULL;
	pthread_mutex_init(&pool_set->lock, NULL);
	return pool_set;
}
int rseq_percpu_pool_set_destroy(struct rseq_percpu_pool_set *pool_set)
{
	int order, ret;

	for (order = POOL_SET_MIN_ENTRY; order < POOL_SET_NR_ENTRIES; order++) {
		struct rseq_percpu_pool *pool = pool_set->entries[order];

		if (!pool)
			continue;
		ret = rseq_percpu_pool_destroy(pool);
		if (ret)
			return ret;
		pool_set->entries[order] = NULL;
	}
	pthread_mutex_destroy(&pool_set->lock);
	free(pool_set);
	return 0;
}
/* Ownership of pool is handed over to pool set on success. */
int rseq_percpu_pool_set_add_pool(struct rseq_percpu_pool_set *pool_set, struct rseq_percpu_pool *pool)
{
	size_t item_order = pool->item_order;
	int ret = 0;

	pthread_mutex_lock(&pool_set->lock);
	if (pool_set->entries[item_order]) {
		errno = EBUSY;
		ret = -1;
		goto end;
	}
	pool_set->entries[pool->item_order] = pool;
end:
	pthread_mutex_unlock(&pool_set->lock);
	return ret;
}
static
void *__rseq_percpu_pool_set_malloc(struct rseq_percpu_pool_set *pool_set, size_t len, bool zeroed)
{
	int order, min_order = POOL_SET_MIN_ENTRY;
	struct rseq_percpu_pool *pool;
	void *addr;

again:
	pthread_mutex_lock(&pool_set->lock);
	/* First smallest present pool where @len fits. */
	for (order = min_order; order < POOL_SET_NR_ENTRIES; order++) {
		pool = pool_set->entries[order];

		if (!pool)
			continue;
		if (pool->item_len >= len)
			goto found;
	}
	pool = NULL;
found:
	pthread_mutex_unlock(&pool_set->lock);
	if (pool) {
		addr = __rseq_percpu_malloc(pool, zeroed);
		if (addr == NULL && errno == ENOMEM) {
			/*
			 * If the allocation failed, try again with a
			 * larger pool.
			 */
			min_order = order + 1;
			goto again;
		}
	} else {
		/* No pool large enough for @len. */
		errno = ENOMEM;
		addr = NULL;
	}
	return addr;
}
void *rseq_percpu_pool_set_malloc(struct rseq_percpu_pool_set *pool_set, size_t len)
{
	return __rseq_percpu_pool_set_malloc(pool_set, len, false);
}

void *rseq_percpu_pool_set_zmalloc(struct rseq_percpu_pool_set *pool_set, size_t len)
{
	return __rseq_percpu_pool_set_malloc(pool_set, len, true);
}