// SPDX-License-Identifier: MIT
// SPDX-FileCopyrightText: 2024 Mathieu Desnoyers <mathieu.desnoyers@efficios.com>

#include <rseq/mempool.h>
#include <sys/mman.h>
#include <assert.h>
#include <string.h>
#include <pthread.h>
#include <unistd.h>
#include <stdlib.h>
#include <rseq/compiler.h>
#include <errno.h>
#include <stdint.h>
#include <stdbool.h>
#include <stdio.h>
#include <fcntl.h>

#ifdef HAVE_LIBNUMA
# include <numa.h>
# include <numaif.h>
#endif

#include "rseq-utils.h"
#include <rseq/rseq.h>

/*
 * rseq-mempool.c: rseq CPU-Local Storage (CLS) memory allocator.
 *
 * The rseq per-CPU memory allocator allows the application to request
 * memory pools of CPU-Local memory, each pool containing objects of a
 * given size (rounded to the next power of 2), reserving a given virtual
 * address size per CPU, for a given maximum number of CPUs.
 *
 * The per-CPU memory allocator is analogous to TLS (Thread-Local
 * Storage) memory: TLS is Thread-Local Storage, whereas the per-CPU
 * memory allocator provides CPU-Local Storage.
 */

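/*
 * Illustrative usage sketch (not part of the allocator). It assumes the
 * public accessors declared in <rseq/mempool.h>, e.g. rseq_percpu_ptr()
 * and rseq_mempool_percpu_free(), and default pool attributes:
 *
 *	struct my_counter { uint64_t count; };
 *	struct rseq_mempool *pool;
 *	struct my_counter __rseq_percpu *c;
 *
 *	pool = rseq_mempool_create("my_counter", sizeof(struct my_counter), NULL);
 *	if (!pool)
 *		abort();
 *	c = (struct my_counter __rseq_percpu *) rseq_mempool_percpu_zmalloc(pool);
 *	if (!c)
 *		abort();
 *	// Access the CPU-local copy for a given cpu index:
 *	rseq_percpu_ptr(c, cpu)->count++;
 *	rseq_mempool_percpu_free(c);
 *	rseq_mempool_destroy(pool);
 */
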
#define POOL_SET_NR_ENTRIES	RSEQ_BITS_PER_LONG

/*
 * Smallest allocation should hold enough space for a free list pointer.
 */
#if RSEQ_BITS_PER_LONG == 64
# define POOL_SET_MIN_ENTRY	3	/* Smallest item_len=8 */
#else
# define POOL_SET_MIN_ENTRY	2	/* Smallest item_len=4 */
#endif

#define BIT_PER_ULONG		(8 * sizeof(unsigned long))

#define MOVE_PAGES_BATCH_SIZE	4096

#define RANGE_HEADER_OFFSET	sizeof(struct rseq_mempool_range)

#if RSEQ_BITS_PER_LONG == 64
# define DEFAULT_POISON_VALUE	0x5555555555555555ULL
#else
# define DEFAULT_POISON_VALUE	0x55555555UL
#endif

struct free_list_node;

struct free_list_node {
	struct free_list_node *next;
};

enum mempool_type {
	MEMPOOL_TYPE_GLOBAL = 0,	/* Default */
	MEMPOOL_TYPE_PERCPU = 1,
};

struct rseq_mempool_attr {
	bool mmap_set;
	void *(*mmap_func)(void *priv, size_t len);
	int (*munmap_func)(void *priv, void *ptr, size_t len);
	void *mmap_priv;

	bool init_set;
	int (*init_func)(void *priv, void *addr, size_t len, int cpu);
	void *init_priv;

	bool robust_set;

	enum mempool_type type;
	size_t stride;
	int max_nr_cpus;

	unsigned long max_nr_ranges;

	bool poison_set;
	uintptr_t poison;

	enum rseq_mempool_populate_policy populate_policy;
};

struct rseq_mempool_range;

struct rseq_mempool_range {
	struct rseq_mempool_range *next;	/* Linked list of ranges. */
	struct rseq_mempool *pool;		/* Backward reference to container pool. */

	/*
	 * Memory layout of a mempool range:
	 * - Header page (contains struct rseq_mempool_range at the very end),
	 * - Base of the per-cpu data, starting with CPU 0.
	 *   Aliases with the free list for a non-robust populate-all pool.
	 * - CPU 1,
	 * ...
	 * - CPU max_nr_cpus - 1,
	 * - init values (unpopulated for RSEQ_MEMPOOL_POPULATE_ALL).
	 *   Aliases with the free list for a non-robust populate-none pool.
	 * - free list (for robust pool).
	 *
	 * The free list aliases the CPU 0 memory area for non-robust
	 * populate-all pools. It aliases with init values for
	 * non-robust populate-none pools. It is located immediately
	 * after the init values for robust pools.
	 */
	void *header;
	void *base;
	/*
	 * The init values contain malloc_init/zmalloc values.
	 * Pointer is NULL for RSEQ_MEMPOOL_POPULATE_ALL.
	 */
	void *init;
	size_t next_unused;

	/* Pool range mmap/munmap */
	void *mmap_addr;
	size_t mmap_len;

	/* Track alloc/free. */
	unsigned long *alloc_bitmap;
};

struct rseq_mempool {
	/* Head of ranges linked-list. */
	struct rseq_mempool_range *range_list;
	unsigned long nr_ranges;

	size_t item_len;
	int item_order;

	/*
	 * The free list chains freed items on the CPU 0 address range.
	 * We should rethink this decision if false sharing between
	 * malloc/free from other CPUs and data accesses from CPU 0
	 * becomes an issue. This is a NULL-terminated singly-linked
	 * list.
	 */
	struct free_list_node *free_list_head;

	/* This lock protects allocation/free within the pool. */
	pthread_mutex_t lock;

	struct rseq_mempool_attr attr;
	char *name;
};

/*
 * Pool set entries are indexed by item_len rounded to the next power of
 * 2. A pool set can contain NULL pool entries, in which case the next
 * large enough entry will be used for allocation.
 */
struct rseq_mempool_set {
	/* This lock protects add vs malloc/zmalloc within the pool set. */
	pthread_mutex_t lock;
	struct rseq_mempool *entries[POOL_SET_NR_ENTRIES];
};

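/*
 * Illustrative pool set usage sketch (not part of the allocator), assuming
 * the declarations from <rseq/mempool.h>; the item sizes below are arbitrary
 * examples:
 *
 *	struct rseq_mempool_set *set = rseq_mempool_set_create();
 *	struct rseq_mempool *pool_32 = rseq_mempool_create("set-32", 32, NULL);
 *	struct rseq_mempool *pool_128 = rseq_mempool_create("set-128", 128, NULL);
 *
 *	rseq_mempool_set_add_pool(set, pool_32);
 *	rseq_mempool_set_add_pool(set, pool_128);
 *	// A 100-byte request falls back to the next large enough entry (128).
 *	void __rseq_percpu *item = rseq_mempool_set_percpu_zmalloc(set, 100);
 *	rseq_mempool_percpu_free(item);
 *	rseq_mempool_set_destroy(set);
 */
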
/*
 * This memfd is used to implement the user COW behavior for the page
 * protection scheme. memfd is a sparse virtual file. Its layout (in
 * offset from beginning of file) matches the process address space
 * (pointers directly converted to file offsets).
 */
struct rseq_memfd {
	pthread_mutex_t lock;
	size_t reserved_size;
	unsigned int refcount;
	int fd;
};

static struct rseq_memfd memfd = {
	.lock = PTHREAD_MUTEX_INITIALIZER,
	.fd = -1,
};

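/*
 * Sketch of the COW scheme used below (illustrative only): the init values
 * area is mapped MAP_SHARED from the memfd, and each per-cpu area is mapped
 * MAP_PRIVATE from the same memfd offset. On Linux, reads from an untouched
 * per-cpu page observe the init values without consuming per-cpu memory; the
 * first write to a per-cpu page triggers copy-on-write, giving that CPU its
 * own private copy. A minimal standalone equivalent, with hypothetical sizes:
 *
 *	int fd = memfd_create("example", MFD_CLOEXEC);
 *	ftruncate(fd, 4096);
 *	char *init = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
 *	char *cpu0 = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);
 *	init[0] = 42;	// visible through cpu0[0] (page not copied yet)
 *	cpu0[0] = 7;	// copy-on-write: cpu0 now diverges from init
 */
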
static
const char *get_pool_name(const struct rseq_mempool *pool)
{
	return pool->name ? : "<anonymous>";
}

static
void *__rseq_pool_range_percpu_ptr(const struct rseq_mempool_range *range, int cpu,
		uintptr_t item_offset, size_t stride)
{
	return range->base + (stride * cpu) + item_offset;
}

static
void *__rseq_pool_range_init_ptr(const struct rseq_mempool_range *range,
		uintptr_t item_offset)
{
	if (!range->init)
		return NULL;
	return range->init + item_offset;
}

static
void __rseq_percpu *__rseq_free_list_to_percpu_ptr(const struct rseq_mempool *pool,
		struct free_list_node *node)
{
	void __rseq_percpu *p = (void __rseq_percpu *) node;

	if (pool->attr.robust_set) {
		/* Skip cpus. */
		p -= pool->attr.max_nr_cpus * pool->attr.stride;
		/* Skip init values */
		if (pool->attr.populate_policy != RSEQ_MEMPOOL_POPULATE_ALL)
			p -= pool->attr.stride;

	} else {
		/* Populate none free list is in init values */
		if (pool->attr.populate_policy != RSEQ_MEMPOOL_POPULATE_ALL)
			p -= pool->attr.max_nr_cpus * pool->attr.stride;
	}
	return p;
}

static
struct free_list_node *__rseq_percpu_to_free_list_ptr(const struct rseq_mempool *pool,
		void __rseq_percpu *p)
{
	if (pool->attr.robust_set) {
		/* Skip cpus. */
		p += pool->attr.max_nr_cpus * pool->attr.stride;
		/* Skip init values */
		if (pool->attr.populate_policy != RSEQ_MEMPOOL_POPULATE_ALL)
			p += pool->attr.stride;

	} else {
		/* Populate none free list is in init values */
		if (pool->attr.populate_policy != RSEQ_MEMPOOL_POPULATE_ALL)
			p += pool->attr.max_nr_cpus * pool->attr.stride;
	}
	return (struct free_list_node *) p;
}

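/*
 * Worked example for the two conversions above (illustrative only,
 * hypothetical numbers): for a robust populate-none pool with
 * max_nr_cpus = 4 and stride = S, an item at offset "off" within the
 * range lives at:
 *
 *	range->base + cpu * S + off	// per-cpu copies (cpu = 0..3)
 *	range->base + 4 * S + off	// init value
 *	range->base + 5 * S + off	// free-list node
 *
 * so converting a free-list node to its percpu (CPU 0) pointer subtracts
 * max_nr_cpus * S and then one extra S for the init values area; the
 * reverse conversion adds the same amounts.
 */
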
static
off_t ptr_to_off_t(void *p)
{
	return (off_t) (uintptr_t) p;
}

static
int memcmpbyte(const char *s, int c, size_t n)
{
	int res = 0;

	while (n-- > 0)
		if ((res = *(s++) - c) != 0)
			break;
	return res;
}

static
void rseq_percpu_zero_item(struct rseq_mempool *pool,
		struct rseq_mempool_range *range, uintptr_t item_offset)
{
	char *init_p = NULL;
	int i;

	init_p = __rseq_pool_range_init_ptr(range, item_offset);
	if (init_p)
		memset(init_p, 0, pool->item_len);
	for (i = 0; i < pool->attr.max_nr_cpus; i++) {
		char *p = __rseq_pool_range_percpu_ptr(range, i,
				item_offset, pool->attr.stride);

		/* Update propagated */
		if (init_p && !memcmpbyte(p, 0, pool->item_len))
			continue;
		memset(p, 0, pool->item_len);
	}
}

static
void rseq_percpu_init_item(struct rseq_mempool *pool,
		struct rseq_mempool_range *range, uintptr_t item_offset,
		void *init_ptr, size_t init_len)
{
	char *init_p = NULL;
	int i;

	init_p = __rseq_pool_range_init_ptr(range, item_offset);
	if (init_p)
		memcpy(init_p, init_ptr, init_len);
	for (i = 0; i < pool->attr.max_nr_cpus; i++) {
		char *p = __rseq_pool_range_percpu_ptr(range, i,
				item_offset, pool->attr.stride);

		/* Update propagated */
		if (init_p && !memcmp(init_p, p, init_len))
			continue;
		memcpy(p, init_ptr, init_len);
	}
}

static
void rseq_poison_item(void *p, size_t item_len, uintptr_t poison)
{
	size_t offset;

	for (offset = 0; offset < item_len; offset += sizeof(uintptr_t))
		*((uintptr_t *) (p + offset)) = poison;
}

static
void rseq_percpu_poison_item(struct rseq_mempool *pool,
		struct rseq_mempool_range *range, uintptr_t item_offset)
{
	uintptr_t poison = pool->attr.poison;
	char *init_p = NULL;
	int i;

	init_p = __rseq_pool_range_init_ptr(range, item_offset);
	if (init_p)
		rseq_poison_item(init_p, pool->item_len, poison);
	for (i = 0; i < pool->attr.max_nr_cpus; i++) {
		char *p = __rseq_pool_range_percpu_ptr(range, i,
				item_offset, pool->attr.stride);

		/* Update propagated */
		if (init_p && !memcmp(init_p, p, pool->item_len))
			continue;
		rseq_poison_item(p, pool->item_len, poison);
	}
}

/* Always inline for __builtin_return_address(0). */
static inline __attribute__((always_inline))
void rseq_check_poison_item(const struct rseq_mempool *pool, uintptr_t item_offset,
		void *p, size_t item_len, uintptr_t poison)
{
	size_t offset;

	for (offset = 0; offset < item_len; offset += sizeof(uintptr_t)) {
		uintptr_t v;

		v = *((uintptr_t *) (p + offset));
		if (v != poison) {
			fprintf(stderr, "%s: Poison corruption detected (0x%lx) for pool: \"%s\" (%p), item offset: %zu, caller: %p.\n",
				__func__, (unsigned long) v, get_pool_name(pool), pool, item_offset, (void *) __builtin_return_address(0));
			abort();
		}
	}
}

/* Always inline for __builtin_return_address(0). */
static inline __attribute__((always_inline))
void rseq_percpu_check_poison_item(const struct rseq_mempool *pool,
		const struct rseq_mempool_range *range, uintptr_t item_offset)
{
	uintptr_t poison = pool->attr.poison;
	char *init_p;
	int i;

	if (!pool->attr.robust_set)
		return;
	init_p = __rseq_pool_range_init_ptr(range, item_offset);
	if (init_p)
		rseq_check_poison_item(pool, item_offset, init_p, pool->item_len, poison);
	for (i = 0; i < pool->attr.max_nr_cpus; i++) {
		char *p = __rseq_pool_range_percpu_ptr(range, i,
				item_offset, pool->attr.stride);
		rseq_check_poison_item(pool, item_offset, p, pool->item_len, poison);
	}
}

#ifdef HAVE_LIBNUMA

int rseq_mempool_range_init_numa(void *addr, size_t len, int cpu, int numa_flags)
{
	unsigned long nr_pages, page_len;
	int status[MOVE_PAGES_BATCH_SIZE];
	int nodes[MOVE_PAGES_BATCH_SIZE];
	void *pages[MOVE_PAGES_BATCH_SIZE];
	long ret = 0;

	page_len = rseq_get_page_len();
	nr_pages = len >> rseq_get_count_order_ulong(page_len);

	nodes[0] = numa_node_of_cpu(cpu);
	if (nodes[0] < 0)
		return -1;

	for (size_t k = 1; k < RSEQ_ARRAY_SIZE(nodes); ++k) {
		nodes[k] = nodes[0];
	}

	for (unsigned long page = 0; page < nr_pages;) {

		size_t max_k = RSEQ_ARRAY_SIZE(pages);
		size_t left = nr_pages - page;

		if (left < max_k) {
			max_k = left;
		}

		for (size_t k = 0; k < max_k; ++k, ++page) {
			pages[k] = addr + (page * page_len);
		}

		ret = move_pages(0, max_k, pages, nodes, status, numa_flags);
		if (ret < 0)
			return ret;

		if (ret > 0) {
			fprintf(stderr, "%lu pages were not migrated\n", ret);
			for (size_t k = 0; k < max_k; ++k) {
				if (status[k] < 0)
					fprintf(stderr,
						"Error while moving page %p to numa node %d: %u\n",
						pages[k], nodes[k], -status[k]);
			}
		}
	}
	return 0;
}

#else

int rseq_mempool_range_init_numa(void *addr __attribute__((unused)),
		size_t len __attribute__((unused)),
		int cpu __attribute__((unused)),
		int numa_flags __attribute__((unused)))
{
	errno = ENOSYS;
	return -1;
}

#endif

static
void *default_mmap_func(void *priv __attribute__((unused)), size_t len)
{
	void *base;

	base = mmap(NULL, len, PROT_READ | PROT_WRITE,
			MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
	if (base == MAP_FAILED)
		return NULL;
	return base;
}

static
int default_munmap_func(void *priv __attribute__((unused)), void *ptr, size_t len)
{
	return munmap(ptr, len);
}

static
int create_alloc_bitmap(struct rseq_mempool *pool, struct rseq_mempool_range *range)
{
	size_t count;

	count = ((pool->attr.stride >> pool->item_order) + BIT_PER_ULONG - 1) / BIT_PER_ULONG;

	/*
	 * Not being able to create the validation bitmap is an error
	 * that needs to be reported.
	 */
	range->alloc_bitmap = calloc(count, sizeof(unsigned long));
	if (!range->alloc_bitmap)
		return -1;
	return 0;
}

static
bool percpu_addr_in_pool(const struct rseq_mempool *pool, void __rseq_percpu *_addr)
{
	struct rseq_mempool_range *range;
	void *addr = (void *) _addr;

	for (range = pool->range_list; range; range = range->next) {
		if (addr >= range->base && addr < range->base + range->next_unused)
			return true;
	}
	return false;
}

/* Always inline for __builtin_return_address(0). */
static inline __attribute__((always_inline))
void check_free_list(const struct rseq_mempool *pool)
{
	size_t total_item = 0, total_never_allocated = 0, total_freed = 0,
		max_list_traversal = 0, traversal_iteration = 0;
	struct rseq_mempool_range *range;

	if (!pool->attr.robust_set)
		return;

	for (range = pool->range_list; range; range = range->next) {
		total_item += pool->attr.stride >> pool->item_order;
		total_never_allocated += (pool->attr.stride - range->next_unused) >> pool->item_order;
	}
	max_list_traversal = total_item - total_never_allocated;

	for (struct free_list_node *node = pool->free_list_head, *prev = NULL;
	     node;
	     prev = node, node = node->next) {

		if (traversal_iteration >= max_list_traversal) {
			fprintf(stderr, "%s: Corrupted free-list; Possibly infinite loop in pool \"%s\" (%p), caller %p.\n",
				__func__, get_pool_name(pool), pool, __builtin_return_address(0));
			abort();
		}

		/* Node is out of range. */
		if (!percpu_addr_in_pool(pool, __rseq_free_list_to_percpu_ptr(pool, node))) {
			if (prev)
				fprintf(stderr, "%s: Corrupted free-list node %p -> [out-of-range %p] in pool \"%s\" (%p), caller %p.\n",
					__func__, prev, node, get_pool_name(pool), pool, __builtin_return_address(0));
			else
				fprintf(stderr, "%s: Corrupted free-list node [out-of-range %p] in pool \"%s\" (%p), caller %p.\n",
					__func__, node, get_pool_name(pool), pool, __builtin_return_address(0));
			abort();
		}

		traversal_iteration++;
		total_freed++;
	}

	if (total_never_allocated + total_freed != total_item) {
		fprintf(stderr, "%s: Corrupted free-list in pool \"%s\" (%p); total-item: %zu total-never-used: %zu total-freed: %zu, caller %p.\n",
			__func__, get_pool_name(pool), pool, total_item, total_never_allocated, total_freed, __builtin_return_address(0));
		abort();
	}
}

/* Always inline for __builtin_return_address(0). */
static inline __attribute__((always_inline))
void check_range_poison(const struct rseq_mempool *pool,
		const struct rseq_mempool_range *range)
{
	size_t item_offset;

	for (item_offset = 0; item_offset < range->next_unused;
			item_offset += pool->item_len)
		rseq_percpu_check_poison_item(pool, range, item_offset);
}

/* Always inline for __builtin_return_address(0). */
static inline __attribute__((always_inline))
void check_pool_poison(const struct rseq_mempool *pool)
{
	struct rseq_mempool_range *range;

	if (!pool->attr.robust_set)
		return;
	for (range = pool->range_list; range; range = range->next)
		check_range_poison(pool, range);
}

/* Always inline for __builtin_return_address(0). */
static inline __attribute__((always_inline))
void destroy_alloc_bitmap(struct rseq_mempool *pool, struct rseq_mempool_range *range)
{
	unsigned long *bitmap = range->alloc_bitmap;
	size_t count, total_leaks = 0;

	if (!bitmap)
		return;

	count = ((pool->attr.stride >> pool->item_order) + BIT_PER_ULONG - 1) / BIT_PER_ULONG;

	/* Assert that all items in the pool were freed. */
	for (size_t k = 0; k < count; ++k)
		total_leaks += rseq_hweight_ulong(bitmap[k]);
	if (total_leaks) {
		fprintf(stderr, "%s: Pool \"%s\" (%p) has %zu leaked items on destroy, caller: %p.\n",
			__func__, get_pool_name(pool), pool, total_leaks, (void *) __builtin_return_address(0));
		abort();
	}

	free(bitmap);
	range->alloc_bitmap = NULL;
}

/* Always inline for __builtin_return_address(0). */
static inline __attribute__((always_inline))
int rseq_mempool_range_destroy(struct rseq_mempool *pool,
		struct rseq_mempool_range *range)
{
	int ret;

	destroy_alloc_bitmap(pool, range);

	if (pool->attr.populate_policy != RSEQ_MEMPOOL_POPULATE_ALL) {
		/*
		 * Punch a hole into memfd where the init values used to be.
		 */
		ret = fallocate(memfd.fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
			ptr_to_off_t(range->init), pool->attr.stride);
		if (ret)
			return ret;
	}

	/* range is a header located one page before the aligned mapping. */
	return pool->attr.munmap_func(pool->attr.mmap_priv, range->mmap_addr, range->mmap_len);
}

/*
 * Allocate a memory mapping aligned on @alignment, with an optional
 * @pre_header before the mapping.
 */
static
void *aligned_mmap_anonymous(struct rseq_mempool *pool,
		size_t page_size, size_t len, size_t alignment,
		void **pre_header, size_t pre_header_len)
{
	size_t minimum_page_count, page_count, extra, total_allocate = 0;
	int page_order;
	void *ptr;

	if (len < page_size || alignment < page_size ||
			!is_pow2(alignment) || (len & (alignment - 1))) {
		errno = EINVAL;
		return NULL;
	}
	page_order = rseq_get_count_order_ulong(page_size);
	if (page_order < 0) {
		errno = EINVAL;
		return NULL;
	}
	if (pre_header_len && (pre_header_len & (page_size - 1))) {
		errno = EINVAL;
		return NULL;
	}

	minimum_page_count = (pre_header_len + len) >> page_order;
	page_count = (pre_header_len + len + alignment - page_size) >> page_order;

	assert(page_count >= minimum_page_count);

	ptr = pool->attr.mmap_func(pool->attr.mmap_priv, page_count << page_order);
	if (!ptr)
		return NULL;

	total_allocate = page_count << page_order;

	if (!(((uintptr_t) ptr + pre_header_len) & (alignment - 1))) {
		/* Pointer is already aligned. ptr points to pre_header. */
		goto out;
	}

	/* Unmap extra before. */
	extra = offset_align((uintptr_t) ptr + pre_header_len, alignment);
	assert(!(extra & (page_size - 1)));
	if (pool->attr.munmap_func(pool->attr.mmap_priv, ptr, extra)) {
		perror("munmap");
		abort();
	}
	total_allocate -= extra;
	ptr += extra;	/* ptr points to pre_header */
	page_count -= extra >> page_order;
out:
	assert(page_count >= minimum_page_count);

	if (page_count > minimum_page_count) {
		void *extra_ptr;

		/* Unmap extra after. */
		extra_ptr = ptr + (minimum_page_count << page_order);
		extra = (page_count - minimum_page_count) << page_order;
		if (pool->attr.munmap_func(pool->attr.mmap_priv, extra_ptr, extra)) {
			perror("munmap");
			abort();
		}
		total_allocate -= extra;
	}

	assert(!(((uintptr_t) ptr + pre_header_len) & (alignment - 1)));
	assert(total_allocate == len + pre_header_len);

	if (pre_header)
		*pre_header = ptr;
	ptr += pre_header_len;
	return ptr;
}

static
int rseq_memfd_reserve_init(void *init, size_t init_len)
{
	int ret = 0;
	size_t reserve_len;

	pthread_mutex_lock(&memfd.lock);
	reserve_len = (size_t) ptr_to_off_t(init) + init_len;
	if (reserve_len > memfd.reserved_size) {
		if (ftruncate(memfd.fd, (off_t) reserve_len)) {
			ret = -1;
			goto unlock;
		}
		memfd.reserved_size = reserve_len;
	}
unlock:
	pthread_mutex_unlock(&memfd.lock);
	return ret;
}

static
struct rseq_mempool_range *rseq_mempool_range_create(struct rseq_mempool *pool)
{
	struct rseq_mempool_range *range;
	unsigned long page_size;
	void *header;
	void *base;
	size_t range_len;	/* Range len excludes header. */

	if (pool->attr.max_nr_ranges &&
			pool->nr_ranges >= pool->attr.max_nr_ranges) {
		errno = ENOMEM;
		return NULL;
	}
	page_size = rseq_get_page_len();

	range_len = pool->attr.stride * pool->attr.max_nr_cpus;
	if (pool->attr.populate_policy != RSEQ_MEMPOOL_POPULATE_ALL)
		range_len += pool->attr.stride;	/* init values */
	if (pool->attr.robust_set)
		range_len += pool->attr.stride;	/* free list */
	base = aligned_mmap_anonymous(pool, page_size,
			range_len, pool->attr.stride,
			&header, page_size);
	if (!base)
		return NULL;
	range = (struct rseq_mempool_range *) (base - RANGE_HEADER_OFFSET);
	range->pool = pool;
	range->header = header;
	range->base = base;
	range->mmap_addr = header;
	range->mmap_len = page_size + range_len;

	if (pool->attr.populate_policy != RSEQ_MEMPOOL_POPULATE_ALL) {
		int cpu;

		range->init = base + (pool->attr.stride * pool->attr.max_nr_cpus);
		/* Populate init values pages from memfd */
		if (rseq_memfd_reserve_init(range->init, pool->attr.stride))
			goto error_alloc;
		if (mmap(range->init, pool->attr.stride, PROT_READ | PROT_WRITE,
				MAP_SHARED | MAP_FIXED, memfd.fd,
				ptr_to_off_t(range->init)) != (void *) range->init) {
			goto error_alloc;
		}
		assert(pool->attr.type == MEMPOOL_TYPE_PERCPU);
		/*
		 * Map per-cpu memory as private COW mappings of init values.
		 */
		for (cpu = 0; cpu < pool->attr.max_nr_cpus; cpu++) {
			void *p = base + (pool->attr.stride * cpu);
			size_t len = pool->attr.stride;

			if (mmap(p, len, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_FIXED,
					memfd.fd, ptr_to_off_t(range->init)) != (void *) p) {
				goto error_alloc;
			}
		}
	}
	if (pool->attr.robust_set) {
		if (create_alloc_bitmap(pool, range))
			goto error_alloc;
	}
	if (pool->attr.init_set) {
		switch (pool->attr.type) {
		case MEMPOOL_TYPE_GLOBAL:
			if (pool->attr.init_func(pool->attr.init_priv,
					base, pool->attr.stride, -1)) {
				goto error_alloc;
			}
			break;
		case MEMPOOL_TYPE_PERCPU:
		{
			int cpu;

			for (cpu = 0; cpu < pool->attr.max_nr_cpus; cpu++) {
				if (pool->attr.init_func(pool->attr.init_priv,
						base + (pool->attr.stride * cpu),
						pool->attr.stride, cpu)) {
					goto error_alloc;
				}
			}
			break;
		}
		}
	}
	pool->nr_ranges++;
	return range;

error_alloc:
	(void) rseq_mempool_range_destroy(pool, range);
	return NULL;
}

static
int rseq_mempool_memfd_ref(struct rseq_mempool *pool)
{
	int ret = 0;

	if (pool->attr.populate_policy == RSEQ_MEMPOOL_POPULATE_ALL)
		return 0;

	pthread_mutex_lock(&memfd.lock);
	if (memfd.refcount == 0) {
		memfd.fd = memfd_create("mempool", MFD_CLOEXEC);
		if (memfd.fd < 0) {
			perror("memfd_create");
			ret = -1;
			goto unlock;
		}
	}
	memfd.refcount++;
unlock:
	pthread_mutex_unlock(&memfd.lock);
	return ret;
}

*pool
)
849 if (pool
->attr
.populate_policy
== RSEQ_MEMPOOL_POPULATE_ALL
)
852 pthread_mutex_lock(&memfd
.lock
);
853 if (memfd
.refcount
== 1) {
854 if (close(memfd
.fd
)) {
859 memfd
.reserved_size
= 0;
862 pthread_mutex_unlock(&memfd
.lock
);
int rseq_mempool_destroy(struct rseq_mempool *pool)
{
	struct rseq_mempool_range *range, *next_range;
	int ret = 0;

	if (!pool)
		return 0;
	check_free_list(pool);
	check_pool_poison(pool);
	/* Iteration safe against removal. */
	for (range = pool->range_list; range && (next_range = range->next, 1); range = next_range) {
		if (rseq_mempool_range_destroy(pool, range)) {
			ret = -1;
			goto end;
		}
		/* Update list head to keep list coherent in case of partial failure. */
		pool->range_list = next_range;
	}
	rseq_mempool_memfd_unref(pool);
	pthread_mutex_destroy(&pool->lock);
	free(pool->name);
	free(pool);
end:
	return ret;
}

struct rseq_mempool *rseq_mempool_create(const char *pool_name,
		size_t item_len, const struct rseq_mempool_attr *_attr)
{
	struct rseq_mempool *pool;
	struct rseq_mempool_attr attr = {};
	int order;

	/* Make sure each item is large enough to contain free list pointers. */
	if (item_len < sizeof(void *))
		item_len = sizeof(void *);

	/* Align item_len on next power of two. */
	order = rseq_get_count_order_ulong(item_len);
	if (order < 0) {
		errno = EINVAL;
		return NULL;
	}
	item_len = 1UL << order;

	if (_attr)
		memcpy(&attr, _attr, sizeof(attr));
	if (!attr.mmap_set) {
		attr.mmap_func = default_mmap_func;
		attr.munmap_func = default_munmap_func;
		attr.mmap_priv = NULL;
	}

	switch (attr.type) {
	case MEMPOOL_TYPE_PERCPU:
		if (attr.max_nr_cpus < 0) {
			errno = EINVAL;
			return NULL;
		}
		if (attr.max_nr_cpus == 0) {
			attr.max_nr_cpus = rseq_get_max_nr_cpus();
			if (attr.max_nr_cpus == 0) {
				errno = EINVAL;
				return NULL;
			}
		}
		break;
	case MEMPOOL_TYPE_GLOBAL:
		/* Override populate policy for global type. */
		attr.populate_policy = RSEQ_MEMPOOL_POPULATE_ALL;
		/* Use a 1-cpu pool for global mempool type. */
		attr.max_nr_cpus = 1;
		break;
	}
	if (!attr.stride)
		attr.stride = RSEQ_MEMPOOL_STRIDE;	/* Use default */
	if (attr.robust_set && !attr.poison_set) {
		attr.poison_set = true;
		attr.poison = DEFAULT_POISON_VALUE;
	}
	if (item_len > attr.stride || attr.stride < (size_t) rseq_get_page_len() ||
			!is_pow2(attr.stride)) {
		errno = EINVAL;
		return NULL;
	}

	pool = calloc(1, sizeof(struct rseq_mempool));
	if (!pool)
		return NULL;

	memcpy(&pool->attr, &attr, sizeof(attr));
	pthread_mutex_init(&pool->lock, NULL);
	pool->item_len = item_len;
	pool->item_order = order;

	if (rseq_mempool_memfd_ref(pool))
		goto error_pool;

	pool->range_list = rseq_mempool_range_create(pool);
	if (!pool->range_list)
		goto error_pool;

	if (pool_name) {
		pool->name = strdup(pool_name);
		if (!pool->name)
			goto error_pool;
	}
	return pool;

error_pool:
	rseq_mempool_destroy(pool);
	return NULL;
}

/* Always inline for __builtin_return_address(0). */
static inline __attribute__((always_inline))
void set_alloc_slot(struct rseq_mempool *pool, struct rseq_mempool_range *range, size_t item_offset)
{
	unsigned long *bitmap = range->alloc_bitmap;
	size_t item_index = item_offset >> pool->item_order;
	unsigned long mask;
	size_t k;

	if (!bitmap)
		return;

	k = item_index / BIT_PER_ULONG;
	mask = 1ULL << (item_index % BIT_PER_ULONG);

	/* Print error if bit is already set. */
	if (bitmap[k] & mask) {
		fprintf(stderr, "%s: Allocator corruption detected for pool: \"%s\" (%p), item offset: %zu, caller: %p.\n",
			__func__, get_pool_name(pool), pool, item_offset, (void *) __builtin_return_address(0));
		abort();
	}
	bitmap[k] |= mask;
}

static
void __rseq_percpu *__rseq_percpu_malloc(struct rseq_mempool *pool,
		bool zeroed, void *init_ptr, size_t init_len)
{
	struct rseq_mempool_range *range;
	struct free_list_node *node;
	uintptr_t item_offset;
	void __rseq_percpu *addr;

	if (init_len > pool->item_len) {
		errno = EINVAL;
		return NULL;
	}
	pthread_mutex_lock(&pool->lock);
	/* Get first entry from free list. */
	node = pool->free_list_head;
	if (node != NULL) {
		void *range_base, *ptr;

		ptr = __rseq_free_list_to_percpu_ptr(pool, node);
		range_base = (void *) ((uintptr_t) ptr & (~(pool->attr.stride - 1)));
		range = (struct rseq_mempool_range *) (range_base - RANGE_HEADER_OFFSET);
		/* Remove node from free list (update head). */
		pool->free_list_head = node->next;
		item_offset = (uintptr_t) (ptr - range_base);
		rseq_percpu_check_poison_item(pool, range, item_offset);
		addr = __rseq_free_list_to_percpu_ptr(pool, node);
		goto end;
	}
	/*
	 * If the most recent range (first in list) does not have any
	 * room left, create a new range and prepend it to the list
	 * head.
	 */
	range = pool->range_list;
	if (range->next_unused + pool->item_len > pool->attr.stride) {
		range = rseq_mempool_range_create(pool);
		if (!range) {
			pthread_mutex_unlock(&pool->lock);
			return NULL;
		}
		/* Add range to head of list. */
		range->next = pool->range_list;
		pool->range_list = range;
	}
	/* First range in list has room left. */
	item_offset = range->next_unused;
	addr = (void __rseq_percpu *) (range->base + item_offset);
	range->next_unused += pool->item_len;
end:
	set_alloc_slot(pool, range, item_offset);
	pthread_mutex_unlock(&pool->lock);
	if (zeroed)
		rseq_percpu_zero_item(pool, range, item_offset);
	else if (init_ptr) {
		rseq_percpu_init_item(pool, range, item_offset,
			init_ptr, init_len);
	}
	return addr;
}

void __rseq_percpu *rseq_mempool_percpu_malloc(struct rseq_mempool *pool)
{
	return __rseq_percpu_malloc(pool, false, NULL, 0);
}

void __rseq_percpu *rseq_mempool_percpu_zmalloc(struct rseq_mempool *pool)
{
	return __rseq_percpu_malloc(pool, true, NULL, 0);
}

void __rseq_percpu *rseq_mempool_percpu_malloc_init(struct rseq_mempool *pool,
		void *init_ptr, size_t len)
{
	return __rseq_percpu_malloc(pool, false, init_ptr, len);
}

/* Always inline for __builtin_return_address(0). */
static inline __attribute__((always_inline))
void clear_alloc_slot(struct rseq_mempool *pool, struct rseq_mempool_range *range, size_t item_offset)
{
	unsigned long *bitmap = range->alloc_bitmap;
	size_t item_index = item_offset >> pool->item_order;
	unsigned long mask;
	size_t k;

	if (!bitmap)
		return;

	k = item_index / BIT_PER_ULONG;
	mask = 1ULL << (item_index % BIT_PER_ULONG);

	/* Print error if bit is not set. */
	if (!(bitmap[k] & mask)) {
		fprintf(stderr, "%s: Double-free detected for pool: \"%s\" (%p), item offset: %zu, caller: %p.\n",
			__func__, get_pool_name(pool), pool, item_offset,
			(void *) __builtin_return_address(0));
		abort();
	}
	bitmap[k] &= ~mask;
}

void librseq_mempool_percpu_free(void __rseq_percpu *_ptr, size_t stride)
{
	uintptr_t ptr = (uintptr_t) _ptr;
	void *range_base = (void *) (ptr & (~(stride - 1)));
	struct rseq_mempool_range *range = (struct rseq_mempool_range *) (range_base - RANGE_HEADER_OFFSET);
	struct rseq_mempool *pool = range->pool;
	uintptr_t item_offset = ptr & (stride - 1);
	struct free_list_node *head, *item;

	pthread_mutex_lock(&pool->lock);
	clear_alloc_slot(pool, range, item_offset);
	/* Add ptr to head of free list */
	head = pool->free_list_head;
	if (pool->attr.poison_set)
		rseq_percpu_poison_item(pool, range, item_offset);
	item = __rseq_percpu_to_free_list_ptr(pool, _ptr);
	/*
	 * Setting the next pointer will overwrite the first uintptr_t
	 * poison for either CPU 0 (populate all) or init data (populate
	 * none).
	 */
	item->next = head;
	pool->free_list_head = item;
	pthread_mutex_unlock(&pool->lock);
}

struct rseq_mempool_set *rseq_mempool_set_create(void)
{
	struct rseq_mempool_set *pool_set;

	pool_set = calloc(1, sizeof(struct rseq_mempool_set));
	if (!pool_set)
		return NULL;
	pthread_mutex_init(&pool_set->lock, NULL);
	return pool_set;
}

int rseq_mempool_set_destroy(struct rseq_mempool_set *pool_set)
{
	int order, ret;

	for (order = POOL_SET_MIN_ENTRY; order < POOL_SET_NR_ENTRIES; order++) {
		struct rseq_mempool *pool = pool_set->entries[order];

		if (!pool)
			continue;
		ret = rseq_mempool_destroy(pool);
		if (ret)
			return ret;
		pool_set->entries[order] = NULL;
	}
	pthread_mutex_destroy(&pool_set->lock);
	free(pool_set);
	return 0;
}

1166 int rseq_mempool_set_add_pool(struct rseq_mempool_set
*pool_set
, struct rseq_mempool
*pool
)
1168 size_t item_order
= pool
->item_order
;
1171 pthread_mutex_lock(&pool_set
->lock
);
1172 if (pool_set
->entries
[item_order
]) {
1177 pool_set
->entries
[pool
->item_order
] = pool
;
1179 pthread_mutex_unlock(&pool_set
->lock
);
static
void __rseq_percpu *__rseq_mempool_set_malloc(struct rseq_mempool_set *pool_set,
		void *init_ptr, size_t len, bool zeroed)
{
	int order, min_order = POOL_SET_MIN_ENTRY;
	struct rseq_mempool *pool;
	void __rseq_percpu *addr;

	order = rseq_get_count_order_ulong(len);
	if (order > POOL_SET_MIN_ENTRY)
		min_order = order;
again:
	pthread_mutex_lock(&pool_set->lock);
	/* First smallest present pool where @len fits. */
	for (order = min_order; order < POOL_SET_NR_ENTRIES; order++) {
		pool = pool_set->entries[order];

		if (!pool)
			continue;
		if (pool->item_len >= len)
			goto found;
	}
	pool = NULL;
found:
	pthread_mutex_unlock(&pool_set->lock);
	if (pool) {
		addr = __rseq_percpu_malloc(pool, zeroed, init_ptr, len);
		if (addr == NULL && errno == ENOMEM) {
			/*
			 * If the allocation failed, try again with a
			 * larger pool.
			 */
			min_order = order + 1;
			goto again;
		}
	} else {
		/* Not found. */
		errno = ENOMEM;
		addr = NULL;
	}
	return addr;
}

void __rseq_percpu *rseq_mempool_set_percpu_malloc(struct rseq_mempool_set *pool_set, size_t len)
{
	return __rseq_mempool_set_malloc(pool_set, NULL, len, false);
}

void __rseq_percpu *rseq_mempool_set_percpu_zmalloc(struct rseq_mempool_set *pool_set, size_t len)
{
	return __rseq_mempool_set_malloc(pool_set, NULL, len, true);
}

void __rseq_percpu *rseq_mempool_set_percpu_malloc_init(struct rseq_mempool_set *pool_set,
		void *init_ptr, size_t len)
{
	return __rseq_mempool_set_malloc(pool_set, init_ptr, len, false);
}

struct rseq_mempool_attr *rseq_mempool_attr_create(void)
{
	return calloc(1, sizeof(struct rseq_mempool_attr));
}

void rseq_mempool_attr_destroy(struct rseq_mempool_attr *attr)
{
	free(attr);
}

int rseq_mempool_attr_set_mmap(struct rseq_mempool_attr *attr,
		void *(*mmap_func)(void *priv, size_t len),
		int (*munmap_func)(void *priv, void *ptr, size_t len),
		void *mmap_priv)
{
	if (!attr) {
		errno = EINVAL;
		return -1;
	}
	attr->mmap_set = true;
	attr->mmap_func = mmap_func;
	attr->munmap_func = munmap_func;
	attr->mmap_priv = mmap_priv;
	return 0;
}

int rseq_mempool_attr_set_init(struct rseq_mempool_attr *attr,
		int (*init_func)(void *priv, void *addr, size_t len, int cpu),
		void *init_priv)
{
	if (!attr) {
		errno = EINVAL;
		return -1;
	}
	attr->init_set = true;
	attr->init_func = init_func;
	attr->init_priv = init_priv;
	return 0;
}

int rseq_mempool_attr_set_robust(struct rseq_mempool_attr *attr)
{
	if (!attr) {
		errno = EINVAL;
		return -1;
	}
	attr->robust_set = true;
	return 0;
}

int rseq_mempool_attr_set_percpu(struct rseq_mempool_attr *attr,
		size_t stride, int max_nr_cpus)
{
	if (!attr) {
		errno = EINVAL;
		return -1;
	}
	attr->type = MEMPOOL_TYPE_PERCPU;
	attr->stride = stride;
	attr->max_nr_cpus = max_nr_cpus;
	return 0;
}

int rseq_mempool_attr_set_global(struct rseq_mempool_attr *attr,
		size_t stride)
{
	if (!attr) {
		errno = EINVAL;
		return -1;
	}
	attr->type = MEMPOOL_TYPE_GLOBAL;
	attr->stride = stride;
	attr->max_nr_cpus = 0;
	return 0;
}

int rseq_mempool_attr_set_max_nr_ranges(struct rseq_mempool_attr *attr,
		unsigned long max_nr_ranges)
{
	if (!attr) {
		errno = EINVAL;
		return -1;
	}
	attr->max_nr_ranges = max_nr_ranges;
	return 0;
}

int rseq_mempool_attr_set_poison(struct rseq_mempool_attr *attr,
		uintptr_t poison)
{
	if (!attr) {
		errno = EINVAL;
		return -1;
	}
	attr->poison_set = true;
	attr->poison = poison;
	return 0;
}

int rseq_mempool_attr_set_populate_policy(struct rseq_mempool_attr *attr,
		enum rseq_mempool_populate_policy policy)
{
	if (!attr) {
		errno = EINVAL;
		return -1;
	}
	attr->populate_policy = policy;
	return 0;
}

int rseq_mempool_get_max_nr_cpus(struct rseq_mempool *mempool)
{
	if (!mempool || mempool->attr.type != MEMPOOL_TYPE_PERCPU) {
		errno = EINVAL;
		return -1;
	}
	return mempool->attr.max_nr_cpus;
}