// SPDX-License-Identifier: MIT
// SPDX-FileCopyrightText: 2024 Mathieu Desnoyers <mathieu.desnoyers@efficios.com>

#include <rseq/mempool.h>
#include <sys/mman.h>
#include <errno.h>
#include <pthread.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <rseq/compiler.h>

#include "rseq-utils.h"

/*
 * rseq-mempool.c: rseq CPU-Local Storage (CLS) memory allocator.
 *
 * The rseq per-CPU memory allocator allows the application to request
 * memory pools of CPU-Local memory, each containing objects of a
 * given size (rounded to the next power of 2), reserving a given
 * virtual address size per CPU, for a given maximum number of CPUs.
 *
 * The per-CPU memory allocator is analogous to TLS (Thread-Local
 * Storage) memory: TLS is Thread-Local Storage, whereas the per-CPU
 * memory allocator provides CPU-Local Storage.
 */
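/*
 * Minimal usage sketch (illustration only, not compiled with this file):
 * pool geometry is arbitrary, cpu is assumed to be a valid CPU number
 * below the pool's max_nr_cpus, and rseq_percpu_ptr() is assumed to be
 * the public accessor from <rseq/mempool.h> wrapping __rseq_percpu_ptr()
 * defined below.
 *
 *	struct counter { unsigned long count; };
 *
 *	struct rseq_percpu_pool *pool;
 *	struct counter __rseq_percpu *c;
 *
 *	pool = rseq_percpu_pool_create("counters", sizeof(struct counter),
 *			4096 * 16, 128, NULL);
 *	if (!pool)
 *		abort();
 *	c = (struct counter __rseq_percpu *) rseq_percpu_zmalloc(pool);
 *	if (!c)
 *		abort();
 *	((struct counter *) rseq_percpu_ptr(c, cpu))->count++;
 *	rseq_percpu_free(c);
 *	(void) rseq_percpu_pool_destroy(pool);
 */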
/*
 * Use high bits of per-CPU addresses to index the pool.
 * This leaves the low bits available to the application for pointer
 * tagging (based on next power of 2 alignment of the allocations).
 */
#if RSEQ_BITS_PER_LONG == 64
# define POOL_INDEX_BITS	16
#else
# define POOL_INDEX_BITS	8
#endif
#define MAX_NR_POOLS		(1UL << POOL_INDEX_BITS)
#define POOL_INDEX_SHIFT	(RSEQ_BITS_PER_LONG - POOL_INDEX_BITS)
#define MAX_POOL_LEN		(1UL << POOL_INDEX_SHIFT)
#define MAX_POOL_LEN_MASK	(MAX_POOL_LEN - 1)
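/*
 * Worked example of the encoding (illustration only): on a 64-bit build,
 * POOL_INDEX_BITS is 16 and POOL_INDEX_SHIFT is 48, so a per-CPU pointer
 * returned by the allocator decodes as:
 *
 *	pool_index  = ptr >> 48;		(which pool)
 *	item_offset = ptr & ((1UL << 48) - 1);	(offset within a CPU range)
 *
 * and the CPU-local address is rebuilt as
 * pool->ranges->base + cpu * pool->percpu_len + item_offset.
 */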
#define POOL_SET_NR_ENTRIES	POOL_INDEX_SHIFT

/*
 * Smallest allocation should hold enough space for a free list pointer.
 */
#if RSEQ_BITS_PER_LONG == 64
# define POOL_SET_MIN_ENTRY	3	/* Smallest item_len=8 */
#else
# define POOL_SET_MIN_ENTRY	2	/* Smallest item_len=4 */
#endif

/*
 * Skip pool index 0 to ensure allocated entries at index 0 do not match
 * a NULL pointer.
 */
#define FIRST_POOL		1

#define BIT_PER_ULONG		(8 * sizeof(unsigned long))

#define MOVE_PAGES_BATCH_SIZE	4096

struct free_list_node;

struct free_list_node {
	struct free_list_node *next;
};

/* This lock protects pool create/destroy. */
static pthread_mutex_t pool_lock = PTHREAD_MUTEX_INITIALIZER;
struct rseq_pool_attr {
	bool mmap_set;
	void *(*mmap_func)(void *priv, size_t len);
	int (*munmap_func)(void *priv, void *ptr, size_t len);
	void *mmap_priv;

	bool robust_set;
};
struct rseq_percpu_pool_range;

struct rseq_percpu_pool_range {
	struct rseq_percpu_pool_range *next;
	struct rseq_percpu_pool *pool;	/* Backward ref. to container pool. */
	void *base;
	size_t next_unused;
	/* Track alloc/free. */
	unsigned long *alloc_bitmap;
};
struct rseq_percpu_pool {
	/* Linked-list of ranges. */
	struct rseq_percpu_pool_range *ranges;

	unsigned int index;		/* Index in the static pool array. */
	size_t item_len;
	size_t percpu_len;
	int item_order;
	int max_nr_cpus;

	/*
	 * The free list chains freed items on the CPU 0 address range.
	 * We should rethink this decision if false sharing between
	 * malloc/free from other CPUs and data accesses from CPU 0
	 * becomes an issue. This is a NULL-terminated singly-linked
	 * list.
	 */
	struct free_list_node *free_list_head;

	/* This lock protects allocation/free within the pool. */
	pthread_mutex_t lock;

	struct rseq_pool_attr attr;
	char *name;
};

//TODO: the array of pools should grow dynamically on create.
static struct rseq_percpu_pool rseq_percpu_pool[MAX_NR_POOLS];
/*
 * Pool set entries are indexed by item_len rounded to the next power of
 * 2. A pool set can contain NULL pool entries, in which case the next
 * large enough entry will be used for allocation.
 */
struct rseq_percpu_pool_set {
	/* This lock protects add vs malloc/zmalloc within the pool set. */
	pthread_mutex_t lock;
	struct rseq_percpu_pool *entries[POOL_SET_NR_ENTRIES];
};
static
void *__rseq_pool_percpu_ptr(struct rseq_percpu_pool *pool, int cpu, uintptr_t item_offset)
{
	/* TODO: Implement multi-ranges support. */
	return pool->ranges->base + (pool->percpu_len * cpu) + item_offset;
}
void *__rseq_percpu_ptr(void __rseq_percpu *_ptr, int cpu)
{
	uintptr_t ptr = (uintptr_t) _ptr;
	uintptr_t item_offset = ptr & MAX_POOL_LEN_MASK;
	uintptr_t pool_index = ptr >> POOL_INDEX_SHIFT;
	struct rseq_percpu_pool *pool = &rseq_percpu_pool[pool_index];

	return __rseq_pool_percpu_ptr(pool, cpu, item_offset);
}
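/*
 * Illustration only: resolving a per-CPU pointer to the CPU-local address
 * for a given cpu number. The item layout below is hypothetical.
 *
 *	struct counter __rseq_percpu *c = rseq_percpu_zmalloc(pool);
 *	struct counter *cpu_local;
 *
 *	cpu_local = (struct counter *) __rseq_percpu_ptr(c, cpu);
 *	cpu_local->count++;	(touches only this cpu's copy)
 */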
static
void rseq_percpu_zero_item(struct rseq_percpu_pool *pool, uintptr_t item_offset)
{
	int i;

	for (i = 0; i < pool->max_nr_cpus; i++) {
		char *p = __rseq_pool_percpu_ptr(pool, i, item_offset);
		memset(p, 0, pool->item_len);
	}
}
//TODO: this will need to be reimplemented for ranges,
//which cannot use __rseq_pool_percpu_ptr.
#if 0 //#ifdef HAVE_LIBNUMA
static
int rseq_percpu_pool_range_init_numa(struct rseq_percpu_pool *pool, struct rseq_percpu_pool_range *range, int numa_flags)
{
	unsigned long nr_pages;
	long ret, page_len;
	int cpu;

	page_len = rseq_get_page_len();
	nr_pages = pool->percpu_len >> rseq_get_count_order_ulong(page_len);
	for (cpu = 0; cpu < pool->max_nr_cpus; cpu++) {

		int status[MOVE_PAGES_BATCH_SIZE];
		int nodes[MOVE_PAGES_BATCH_SIZE];
		void *pages[MOVE_PAGES_BATCH_SIZE];

		nodes[0] = numa_node_of_cpu(cpu);
		for (size_t k = 1; k < RSEQ_ARRAY_SIZE(nodes); ++k) {
			nodes[k] = nodes[0];
		}

		for (unsigned long page = 0; page < nr_pages;) {

			size_t max_k = RSEQ_ARRAY_SIZE(pages);
			size_t left = nr_pages - page;

			if (left < max_k)
				max_k = left;

			for (size_t k = 0; k < max_k; ++k, ++page) {
				pages[k] = __rseq_pool_percpu_ptr(pool, cpu, page * page_len);
			}

			ret = move_pages(0, max_k, pages, nodes, status, numa_flags);
			if (ret < 0)
				return ret;

			if (ret > 0) {
				fprintf(stderr, "%lu pages were not migrated\n", ret);
				for (size_t k = 0; k < max_k; ++k) {
					if (status[k] < 0)
						fprintf(stderr,
							"Error while moving page %p to numa node %d: %u\n",
							pages[k], nodes[k], -status[k]);
				}
			}
		}
	}
	return 0;
}
int rseq_percpu_pool_init_numa(struct rseq_percpu_pool *pool, int numa_flags)
{
	struct rseq_percpu_pool_range *range;
	int ret;

	if (!numa_flags)
		return 0;
	for (range = pool->ranges; range; range = range->next) {
		ret = rseq_percpu_pool_range_init_numa(pool, range, numa_flags);
		if (ret)
			return ret;
	}
	return 0;
}
#else
int rseq_percpu_pool_init_numa(struct rseq_percpu_pool *pool __attribute__((unused)),
		int numa_flags __attribute__((unused)))
{
	return 0;
}
#endif
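/*
 * Illustration only: numa_flags is handed through to move_pages(2), so a
 * caller wanting each CPU's pages migrated to that CPU's NUMA node would
 * typically pass MPOL_MF_MOVE (a move_pages(2) flag, not something
 * defined by this file):
 *
 *	if (rseq_percpu_pool_init_numa(pool, MPOL_MF_MOVE))
 *		perror("rseq_percpu_pool_init_numa");
 */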
static
void *default_mmap_func(void *priv __attribute__((unused)), size_t len)
{
	void *base;

	base = mmap(NULL, len, PROT_READ | PROT_WRITE,
		    MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
	if (base == MAP_FAILED)
		return NULL;
	return base;
}

static
int default_munmap_func(void *priv __attribute__((unused)), void *ptr, size_t len)
{
	return munmap(ptr, len);
}
static
int create_alloc_bitmap(struct rseq_percpu_pool *pool, struct rseq_percpu_pool_range *range)
{
	size_t count;

	count = ((pool->percpu_len >> pool->item_order) + BIT_PER_ULONG - 1) / BIT_PER_ULONG;

	/*
	 * Not being able to create the validation bitmap is an error
	 * that needs to be reported.
	 */
	range->alloc_bitmap = calloc(count, sizeof(unsigned long));
	if (!range->alloc_bitmap)
		return -1;
	return 0;
}

static
const char *get_pool_name(const struct rseq_percpu_pool *pool)
{
	return pool->name ? : "<anonymous>";
}
static
bool addr_in_pool(const struct rseq_percpu_pool *pool, void *addr)
{
	struct rseq_percpu_pool_range *range;

	for (range = pool->ranges; range; range = range->next) {
		if (addr >= range->base && addr < range->base + range->next_unused)
			return true;
	}
	return false;
}
/* Always inline for __builtin_return_address(0). */
static inline __attribute__((always_inline))
void check_free_list(const struct rseq_percpu_pool *pool)
{
	size_t total_item = 0, total_never_allocated = 0, total_freed = 0,
		max_list_traversal = 0, traversal_iteration = 0;
	struct rseq_percpu_pool_range *range;

	if (!pool->attr.robust_set)
		return;

	for (range = pool->ranges; range; range = range->next) {
		total_item += pool->percpu_len >> pool->item_order;
		total_never_allocated += (pool->percpu_len - range->next_unused) >> pool->item_order;
	}
	max_list_traversal = total_item - total_never_allocated;

	for (struct free_list_node *node = pool->free_list_head, *prev = NULL;
			node;
			prev = node, node = node->next) {
		void *node_addr = node;

		if (traversal_iteration >= max_list_traversal) {
			fprintf(stderr, "%s: Corrupted free-list; Possibly infinite loop in pool \"%s\" (%p), caller %p.\n",
				__func__, get_pool_name(pool), pool, __builtin_return_address(0));
			abort();
		}

		/* Node is out of range. */
		if (!addr_in_pool(pool, node_addr)) {
			if (prev)
				fprintf(stderr, "%s: Corrupted free-list node %p -> [out-of-range %p] in pool \"%s\" (%p), caller %p.\n",
					__func__, prev, node, get_pool_name(pool), pool, __builtin_return_address(0));
			else
				fprintf(stderr, "%s: Corrupted free-list node [out-of-range %p] in pool \"%s\" (%p), caller %p.\n",
					__func__, node, get_pool_name(pool), pool, __builtin_return_address(0));
			abort();
		}

		traversal_iteration++;
		total_freed++;
	}

	if (total_never_allocated + total_freed != total_item) {
		fprintf(stderr, "%s: Corrupted free-list in pool \"%s\" (%p); total-item: %zu total-never-used: %zu total-freed: %zu, caller %p.\n",
			__func__, get_pool_name(pool), pool, total_item, total_never_allocated, total_freed, __builtin_return_address(0));
		abort();
	}
}
/* Always inline for __builtin_return_address(0). */
static inline __attribute__((always_inline))
void destroy_alloc_bitmap(struct rseq_percpu_pool *pool, struct rseq_percpu_pool_range *range)
{
	unsigned long *bitmap = range->alloc_bitmap;
	size_t count, total_leaks = 0;

	if (!bitmap)
		return;

	count = ((pool->percpu_len >> pool->item_order) + BIT_PER_ULONG - 1) / BIT_PER_ULONG;

	/* Assert that all items in the pool were freed. */
	for (size_t k = 0; k < count; ++k)
		total_leaks += rseq_hweight_ulong(bitmap[k]);
	if (total_leaks) {
		fprintf(stderr, "%s: Pool \"%s\" (%p) has %zu leaked items on destroy, caller: %p.\n",
			__func__, get_pool_name(pool), pool, total_leaks, (void *) __builtin_return_address(0));
		abort();
	}

	free(bitmap);
}
/* Always inline for __builtin_return_address(0). */
static inline __attribute__((always_inline))
int rseq_percpu_pool_range_destroy(struct rseq_percpu_pool *pool,
		struct rseq_percpu_pool_range *range)
{
	destroy_alloc_bitmap(pool, range);
	return pool->attr.munmap_func(pool->attr.mmap_priv, range->base,
			pool->percpu_len * pool->max_nr_cpus);
}
static
struct rseq_percpu_pool_range *rseq_percpu_pool_range_create(struct rseq_percpu_pool *pool)
{
	struct rseq_percpu_pool_range *range;
	void *base;

	range = calloc(1, sizeof(struct rseq_percpu_pool_range));
	if (!range)
		return NULL;
	range->pool = pool;

	base = pool->attr.mmap_func(pool->attr.mmap_priv, pool->percpu_len * pool->max_nr_cpus);
	if (!base)
		goto error_alloc;
	range->base = base;
	if (pool->attr.robust_set) {
		if (create_alloc_bitmap(pool, range))
			goto error_alloc;
	}
	return range;

error_alloc:
	(void) rseq_percpu_pool_range_destroy(pool, range);
	return NULL;
}
/* Always inline for __builtin_return_address(0). */
static inline __attribute__((always_inline))
int __rseq_percpu_pool_destroy(struct rseq_percpu_pool *pool)
{
	struct rseq_percpu_pool_range *range, *next_range;
	int ret = 0;

	check_free_list(pool);
	/* Iteration safe against removal. */
	for (range = pool->ranges; range && (next_range = range->next, 1); range = next_range) {
		if (rseq_percpu_pool_range_destroy(pool, range)) {
			ret = -1;
			goto end;
		}
		/* Update list head to keep list coherent in case of partial failure. */
		pool->ranges = next_range;
	}
	pthread_mutex_destroy(&pool->lock);
	free(pool->name);
	memset(pool, 0, sizeof(*pool));
end:
	return ret;
}
int rseq_percpu_pool_destroy(struct rseq_percpu_pool *pool)
{
	int ret;

	pthread_mutex_lock(&pool_lock);
	ret = __rseq_percpu_pool_destroy(pool);
	pthread_mutex_unlock(&pool_lock);
	return ret;
}
struct rseq_percpu_pool *rseq_percpu_pool_create(const char *pool_name,
		size_t item_len, size_t percpu_len, int max_nr_cpus,
		const struct rseq_pool_attr *_attr)
{
	struct rseq_percpu_pool *pool;
	struct rseq_pool_attr attr = {};
	unsigned int i;
	int order;

	/* Make sure each item is large enough to contain free list pointers. */
	if (item_len < sizeof(void *))
		item_len = sizeof(void *);

	/* Align item_len on next power of two. */
	order = rseq_get_count_order_ulong(item_len);
	item_len = 1UL << order;

	/* Align percpu_len on page size. */
	percpu_len = rseq_align(percpu_len, rseq_get_page_len());

	if (max_nr_cpus < 0 || item_len > percpu_len ||
			percpu_len > (UINTPTR_MAX >> POOL_INDEX_BITS)) {
		errno = EINVAL;
		return NULL;
	}

	if (_attr)
		memcpy(&attr, _attr, sizeof(attr));
	if (!attr.mmap_set) {
		attr.mmap_func = default_mmap_func;
		attr.munmap_func = default_munmap_func;
		attr.mmap_priv = NULL;
	}

	pthread_mutex_lock(&pool_lock);
	/* Linear scan in array of pools to find empty spot. */
	for (i = FIRST_POOL; i < MAX_NR_POOLS; i++) {
		pool = &rseq_percpu_pool[i];
		if (!pool->percpu_len)
			goto found_empty;
	}
	errno = ENOMEM;
	pthread_mutex_unlock(&pool_lock);
	return NULL;

found_empty:
	memcpy(&pool->attr, &attr, sizeof(attr));
	pthread_mutex_init(&pool->lock, NULL);
	pool->percpu_len = percpu_len;
	pool->max_nr_cpus = max_nr_cpus;
	pool->index = i;
	pool->item_len = item_len;
	pool->item_order = order;

	//TODO: implement multi-range support.
	pool->ranges = rseq_percpu_pool_range_create(pool);
	if (!pool->ranges)
		goto error_alloc;

	if (pool_name) {
		pool->name = strdup(pool_name);
		if (!pool->name)
			goto error_alloc;
	}
	pthread_mutex_unlock(&pool_lock);
	return pool;

error_alloc:
	__rseq_percpu_pool_destroy(pool);
	pthread_mutex_unlock(&pool_lock);
	return NULL;
}
/* Always inline for __builtin_return_address(0). */
static inline __attribute__((always_inline))
void set_alloc_slot(struct rseq_percpu_pool *pool, size_t item_offset)
{
	unsigned long *bitmap = pool->ranges->alloc_bitmap;
	size_t item_index = item_offset >> pool->item_order;
	unsigned long mask;
	size_t k;

	if (!bitmap)
		return;

	k = item_index / BIT_PER_ULONG;
	mask = 1ULL << (item_index % BIT_PER_ULONG);

	/* Print error if bit is already set. */
	if (bitmap[k] & mask) {
		fprintf(stderr, "%s: Allocator corruption detected for pool: \"%s\" (%p), item offset: %zu, caller: %p.\n",
			__func__, get_pool_name(pool), pool, item_offset, (void *) __builtin_return_address(0));
		abort();
	}
	bitmap[k] |= mask;
}
static
void __rseq_percpu *__rseq_percpu_malloc(struct rseq_percpu_pool *pool, bool zeroed)
{
	struct free_list_node *node;
	uintptr_t item_offset;
	void __rseq_percpu *addr;

	pthread_mutex_lock(&pool->lock);
	/* Get first entry from free list. */
	node = pool->free_list_head;
	if (node != NULL) {
		/* Remove node from free list (update head). */
		pool->free_list_head = node->next;
		item_offset = (uintptr_t) ((void *) node - pool->ranges->base);
		addr = (void *) (((uintptr_t) pool->index << POOL_INDEX_SHIFT) | item_offset);
		set_alloc_slot(pool, item_offset);
		goto end;
	}
	if (pool->ranges->next_unused + pool->item_len > pool->percpu_len) {
		errno = ENOMEM;
		addr = NULL;
		goto end;
	}
	item_offset = pool->ranges->next_unused;
	addr = (void *) (((uintptr_t) pool->index << POOL_INDEX_SHIFT) | item_offset);
	pool->ranges->next_unused += pool->item_len;
	set_alloc_slot(pool, item_offset);
end:
	pthread_mutex_unlock(&pool->lock);
	if (zeroed && addr)
		rseq_percpu_zero_item(pool, item_offset);
	return addr;
}

void __rseq_percpu *rseq_percpu_malloc(struct rseq_percpu_pool *pool)
{
	return __rseq_percpu_malloc(pool, false);
}

void __rseq_percpu *rseq_percpu_zmalloc(struct rseq_percpu_pool *pool)
{
	return __rseq_percpu_malloc(pool, true);
}
/* Always inline for __builtin_return_address(0). */
static inline __attribute__((always_inline))
void clear_alloc_slot(struct rseq_percpu_pool *pool, size_t item_offset)
{
	unsigned long *bitmap = pool->ranges->alloc_bitmap;
	size_t item_index = item_offset >> pool->item_order;
	unsigned long mask;
	size_t k;

	if (!bitmap)
		return;

	k = item_index / BIT_PER_ULONG;
	mask = 1ULL << (item_index % BIT_PER_ULONG);

	/* Print error if bit is not set. */
	if (!(bitmap[k] & mask)) {
		fprintf(stderr, "%s: Double-free detected for pool: \"%s\" (%p), item offset: %zu, caller: %p.\n",
			__func__, get_pool_name(pool), pool, item_offset,
			(void *) __builtin_return_address(0));
		abort();
	}
	bitmap[k] &= ~mask;
}
void rseq_percpu_free(void __rseq_percpu *_ptr)
{
	uintptr_t ptr = (uintptr_t) _ptr;
	uintptr_t item_offset = ptr & MAX_POOL_LEN_MASK;
	uintptr_t pool_index = ptr >> POOL_INDEX_SHIFT;
	struct rseq_percpu_pool *pool = &rseq_percpu_pool[pool_index];
	struct free_list_node *head, *item;

	pthread_mutex_lock(&pool->lock);
	clear_alloc_slot(pool, item_offset);
	/* Add ptr to head of free list */
	head = pool->free_list_head;
	/* Free-list is in CPU 0 range. */
	item = (struct free_list_node *) __rseq_pool_percpu_ptr(pool, 0, item_offset);
	item->next = head;
	pool->free_list_head = item;
	pthread_mutex_unlock(&pool->lock);
}
struct rseq_percpu_pool_set *rseq_percpu_pool_set_create(void)
{
	struct rseq_percpu_pool_set *pool_set;

	pool_set = calloc(1, sizeof(struct rseq_percpu_pool_set));
	if (!pool_set)
		return NULL;
	pthread_mutex_init(&pool_set->lock, NULL);
	return pool_set;
}

int rseq_percpu_pool_set_destroy(struct rseq_percpu_pool_set *pool_set)
{
	int order, ret;

	for (order = POOL_SET_MIN_ENTRY; order < POOL_SET_NR_ENTRIES; order++) {
		struct rseq_percpu_pool *pool = pool_set->entries[order];

		if (!pool)
			continue;
		ret = rseq_percpu_pool_destroy(pool);
		if (ret)
			return ret;
		pool_set->entries[order] = NULL;
	}
	pthread_mutex_destroy(&pool_set->lock);
	free(pool_set);
	return 0;
}
/* Ownership of pool is handed over to pool set on success. */
int rseq_percpu_pool_set_add_pool(struct rseq_percpu_pool_set *pool_set, struct rseq_percpu_pool *pool)
{
	size_t item_order = pool->item_order;
	int ret = 0;

	pthread_mutex_lock(&pool_set->lock);
	if (pool_set->entries[item_order]) {
		errno = EBUSY;
		ret = -1;
		goto end;
	}
	pool_set->entries[pool->item_order] = pool;
end:
	pthread_mutex_unlock(&pool_set->lock);
	return ret;
}
static
void __rseq_percpu *__rseq_percpu_pool_set_malloc(struct rseq_percpu_pool_set *pool_set, size_t len, bool zeroed)
{
	int order, min_order = POOL_SET_MIN_ENTRY;
	struct rseq_percpu_pool *pool;
	void __rseq_percpu *addr;

	order = rseq_get_count_order_ulong(len);
	if (order > POOL_SET_MIN_ENTRY)
		min_order = order;
again:
	pthread_mutex_lock(&pool_set->lock);
	/* First smallest present pool where @len fits. */
	for (order = min_order; order < POOL_SET_NR_ENTRIES; order++) {
		pool = pool_set->entries[order];

		if (!pool)
			continue;
		if (pool->item_len >= len)
			goto found;
	}
	pool = NULL;
found:
	pthread_mutex_unlock(&pool_set->lock);
	if (pool) {
		addr = __rseq_percpu_malloc(pool, zeroed);
		if (addr == NULL && errno == ENOMEM) {
			/*
			 * If the allocation failed, try again with a
			 * larger pool.
			 */
			min_order = order + 1;
			goto again;
		}
	} else {
		/* No pool can satisfy the allocation request. */
		errno = ENOMEM;
		addr = NULL;
	}
	return addr;
}

void __rseq_percpu *rseq_percpu_pool_set_malloc(struct rseq_percpu_pool_set *pool_set, size_t len)
{
	return __rseq_percpu_pool_set_malloc(pool_set, len, false);
}

void __rseq_percpu *rseq_percpu_pool_set_zmalloc(struct rseq_percpu_pool_set *pool_set, size_t len)
{
	return __rseq_percpu_pool_set_malloc(pool_set, len, true);
}
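/*
 * Illustration only: a pool set groups pools of different item sizes so
 * callers can allocate by length. Pool geometry below is arbitrary.
 *
 *	struct rseq_percpu_pool_set *set = rseq_percpu_pool_set_create();
 *	struct rseq_percpu_pool *pool_8 = rseq_percpu_pool_create("set-8", 8,
 *			4096 * 4, 128, NULL);
 *	struct rseq_percpu_pool *pool_64 = rseq_percpu_pool_create("set-64", 64,
 *			4096 * 4, 128, NULL);
 *
 *	if (!set || !pool_8 || !pool_64)
 *		abort();
 *	if (rseq_percpu_pool_set_add_pool(set, pool_8) ||
 *			rseq_percpu_pool_set_add_pool(set, pool_64))
 *		abort();
 *	(A 24-byte request falls back to the next large enough entry: 64.)
 *	void __rseq_percpu *p = rseq_percpu_pool_set_zmalloc(set, 24);
 *
 *	rseq_percpu_free(p);
 *	(void) rseq_percpu_pool_set_destroy(set);  (also destroys added pools)
 */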
struct rseq_pool_attr *rseq_pool_attr_create(void)
{
	return calloc(1, sizeof(struct rseq_pool_attr));
}

void rseq_pool_attr_destroy(struct rseq_pool_attr *attr)
{
	free(attr);
}

int rseq_pool_attr_set_mmap(struct rseq_pool_attr *attr,
		void *(*mmap_func)(void *priv, size_t len),
		int (*munmap_func)(void *priv, void *ptr, size_t len),
		void *mmap_priv)
{
	if (!attr) {
		errno = EINVAL;
		return -1;
	}
	attr->mmap_set = true;
	attr->mmap_func = mmap_func;
	attr->munmap_func = munmap_func;
	attr->mmap_priv = mmap_priv;
	return 0;
}

int rseq_pool_attr_set_robust(struct rseq_pool_attr *attr)
{
	if (!attr) {
		errno = EINVAL;
		return -1;
	}
	attr->robust_set = true;
	return 0;
}
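/*
 * Illustration only: a sketch of configuring a pool through attributes.
 * The custom hugepage_mmap/hugepage_munmap callbacks are hypothetical
 * application code, not part of this library.
 *
 *	struct rseq_pool_attr *attr = rseq_pool_attr_create();
 *
 *	if (!attr)
 *		abort();
 *	(Robust pools track allocations and report double-free and leaks.)
 *	if (rseq_pool_attr_set_robust(attr))
 *		abort();
 *	(Optionally override how backing memory is obtained and released.)
 *	if (rseq_pool_attr_set_mmap(attr, hugepage_mmap, hugepage_munmap, NULL))
 *		abort();
 *	pool = rseq_percpu_pool_create("robust-pool", 16, 4096 * 8, 128, attr);
 *	rseq_pool_attr_destroy(attr);	(the pool keeps its own copy of attr)
 */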