// SPDX-License-Identifier: MIT
// SPDX-FileCopyrightText: 2024 Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
// SPDX-FileCopyrightText: 2024 Olivier Dion <odion@efficios.com>

#include <rseq/mempool.h>
#include <sys/mman.h>
#include <assert.h>
#include <string.h>
#include <strings.h>
#include <pthread.h>
#include <unistd.h>
#include <stdlib.h>
#include <rseq/compiler.h>
#include <errno.h>
#include <stdint.h>
#include <stdbool.h>
#include <stdio.h>
#include <fcntl.h>

#ifdef HAVE_LIBNUMA
# include <numa.h>
# include <numaif.h>
#endif

#include "rseq-utils.h"
#include <rseq/rseq.h>
/*
 * rseq-mempool.c: rseq CPU-Local Storage (CLS) memory allocator.
 *
 * The rseq per-CPU memory allocator allows the application to request
 * memory pools of CPU-Local memory, each containing objects of a
 * given size (rounded up to the next power of 2), reserving a given
 * virtual address size per CPU, for a given maximum number of CPUs.
 *
 * The per-CPU memory allocator is analogous to TLS (Thread-Local
 * Storage) memory: TLS is Thread-Local Storage, whereas the per-CPU
 * memory allocator provides CPU-Local Storage.
 */
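
/*
 * Minimal usage sketch of the allocator implemented in this file
 * (struct my_counter and the error handling are illustrative only;
 * rseq_percpu_ptr() is the per-CPU accessor from <rseq/mempool.h>):
 *
 *	struct my_counter { long count; };
 *	struct rseq_mempool *pool;
 *	struct my_counter __rseq_percpu *c;
 *
 *	pool = rseq_mempool_create("counters", sizeof(struct my_counter), NULL);
 *	if (!pool)
 *		abort();
 *	c = (struct my_counter __rseq_percpu *) rseq_mempool_percpu_zmalloc(pool);
 *	if (!c)
 *		abort();
 *	rseq_percpu_ptr(c, 0)->count++;		// CPU 0 copy of the object
 *	rseq_mempool_percpu_free(c);
 *	rseq_mempool_destroy(pool);
 */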

#define POOL_SET_NR_ENTRIES	RSEQ_BITS_PER_LONG

#define POOL_HEADER_NR_PAGES	2

/*
 * Smallest allocation should hold enough space for a free list pointer.
 */
#if RSEQ_BITS_PER_LONG == 64
# define POOL_SET_MIN_ENTRY	3	/* Smallest item_len=8 */
#else
# define POOL_SET_MIN_ENTRY	2	/* Smallest item_len=4 */
#endif

#define BIT_PER_ULONG		(8 * sizeof(unsigned long))

#define MOVE_PAGES_BATCH_SIZE	4096

#define RANGE_HEADER_OFFSET	sizeof(struct rseq_mempool_range)

#if RSEQ_BITS_PER_LONG == 64
# define DEFAULT_COW_INIT_POISON_VALUE	0x5555555555555555ULL
#else
# define DEFAULT_COW_INIT_POISON_VALUE	0x55555555UL
#endif

/*
 * Define the default COW_ZERO poison value as zero to prevent useless
 * COW page allocation when writing poison values when freeing items.
 */
#define DEFAULT_COW_ZERO_POISON_VALUE	0x0

struct free_list_node;

struct free_list_node {
	struct free_list_node *next;
};

enum mempool_type {
	MEMPOOL_TYPE_GLOBAL = 0,	/* Default */
	MEMPOOL_TYPE_PERCPU = 1,
};

struct rseq_mempool_attr {
	bool init_set;
	int (*init_func)(void *priv, void *addr, size_t len, int cpu);
	void *init_priv;

	bool robust_set;

	enum mempool_type type;
	size_t stride;
	int max_nr_cpus;

	unsigned long max_nr_ranges;

	bool poison_set;
	uintptr_t poison;

	enum rseq_mempool_populate_policy populate_policy;
};

struct rseq_mempool_range;

struct rseq_mempool_range {
	struct rseq_mempool_range *next;	/* Linked list of ranges. */
	struct rseq_mempool *pool;		/* Backward reference to container pool. */

	/*
	 * Memory layout of a mempool range:
	 * - Canary header page (for detection of destroy-after-fork of
	 *   COW_INIT pool),
	 * - Header page (contains struct rseq_mempool_range at the
	 *   very end),
	 * - Base of the per-cpu data, starting with CPU 0.
	 *   Aliases with free-list for non-robust COW_ZERO pool.
	 * - CPU 1,
	 * ...
	 * - CPU max_nr_cpus - 1
	 * - init values (only allocated for COW_INIT pool).
	 *   Aliases with free-list for non-robust COW_INIT pool.
	 * - free list (for robust pool).
	 *
	 * The free list aliases the CPU 0 memory area for non-robust
	 * COW_ZERO pools. It aliases with init values for non-robust
	 * COW_INIT pools. It is located immediately after the init
	 * values for robust pools.
	 */
	void *header;
	void *base;
	/*
	 * The init values contain malloc_init/zmalloc values.
	 * Pointer is NULL for RSEQ_MEMPOOL_POPULATE_COW_ZERO.
	 */
	void *init;
	size_t next_unused;

	/* Pool range mmap/munmap */
	void *mmap_addr;
	size_t mmap_len;

	/* Track alloc/free. */
	unsigned long *alloc_bitmap;
};
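
/*
 * Illustrative mapping layout for one range of a robust COW_INIT pool
 * with max_nr_cpus = 2 (a sketch of the layout comment above):
 *
 *	header                     canary page (0x1 marker, WIPEONFORK)
 *	header + page_size         header page; struct rseq_mempool_range
 *	                           sits at base - RANGE_HEADER_OFFSET
 *	base + 0 * stride          CPU 0 items
 *	base + 1 * stride          CPU 1 items
 *	base + 2 * stride          init values (range->init)
 *	base + 3 * stride          dedicated free list
 */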

struct rseq_mempool {
	/* Head of ranges linked-list. */
	struct rseq_mempool_range *range_list;
	unsigned long nr_ranges;

	size_t item_len;
	int item_order;

	/*
	 * COW_INIT non-robust pools:
	 *         The free list chains freed items on the init
	 *         values address range.
	 *
	 * COW_ZERO non-robust pools:
	 *         The free list chains freed items on the CPU 0
	 *         address range. We should rethink this
	 *         decision if false sharing between malloc/free
	 *         from other CPUs and data accesses from CPU 0
	 *         becomes an issue.
	 *
	 * Robust pools: The free list chains freed items in the
	 *         address range dedicated for the free list.
	 *
	 * This is a NULL-terminated singly-linked list.
	 */
	struct free_list_node *free_list_head;

	/* This lock protects allocation/free within the pool. */
	pthread_mutex_t lock;

	struct rseq_mempool_attr attr;
	char *name;
};

/*
 * Pool set entries are indexed by item_len rounded to the next power of
 * 2. A pool set can contain NULL pool entries, in which case the next
 * large enough entry will be used for allocation.
 */
struct rseq_mempool_set {
	/* This lock protects add vs malloc/zmalloc within the pool set. */
	pthread_mutex_t lock;
	struct rseq_mempool *entries[POOL_SET_NR_ENTRIES];
};

static
const char *get_pool_name(const struct rseq_mempool *pool)
{
	return pool->name ? : "<anonymous>";
}

static
void *__rseq_pool_range_percpu_ptr(const struct rseq_mempool_range *range, int cpu,
		uintptr_t item_offset, size_t stride)
{
	return range->base + (stride * cpu) + item_offset;
}

static
void *__rseq_pool_range_init_ptr(const struct rseq_mempool_range *range,
		uintptr_t item_offset)
{
	if (!range->init)
		return NULL;
	return range->init + item_offset;
}

static
void __rseq_percpu *__rseq_free_list_to_percpu_ptr(const struct rseq_mempool *pool,
		struct free_list_node *node)
{
	void __rseq_percpu *p = (void __rseq_percpu *) node;

	if (pool->attr.robust_set) {
		/* Skip cpus. */
		p -= pool->attr.max_nr_cpus * pool->attr.stride;
		/* Skip init values */
		if (pool->attr.populate_policy == RSEQ_MEMPOOL_POPULATE_COW_INIT)
			p -= pool->attr.stride;
	} else {
		/* COW_INIT free list is in init values */
		if (pool->attr.populate_policy == RSEQ_MEMPOOL_POPULATE_COW_INIT)
			p -= pool->attr.max_nr_cpus * pool->attr.stride;
	}
	return p;
}

static
struct free_list_node *__rseq_percpu_to_free_list_ptr(const struct rseq_mempool *pool,
		void __rseq_percpu *p)
{
	if (pool->attr.robust_set) {
		/* Skip cpus. */
		p += pool->attr.max_nr_cpus * pool->attr.stride;
		/* Skip init values */
		if (pool->attr.populate_policy == RSEQ_MEMPOOL_POPULATE_COW_INIT)
			p += pool->attr.stride;
	} else {
		/* COW_INIT free list is in init values */
		if (pool->attr.populate_policy == RSEQ_MEMPOOL_POPULATE_COW_INIT)
			p += pool->attr.max_nr_cpus * pool->attr.stride;
	}
	return (struct free_list_node *) p;
}
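
/*
 * Worked example for the two translations above (numbers are
 * hypothetical): in a robust COW_INIT pool with max_nr_cpus = 4 and
 * stride = 0x100000, the free-list node of a freed item whose percpu
 * address is base + 0x40 lives at base + (4 + 1) * 0x100000 + 0x40,
 * i.e. in the free-list area located after the init values. The two
 * functions are exact inverses of each other.
 */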

static
intptr_t rseq_cmp_item(void *p, size_t item_len, intptr_t cmp_value, intptr_t *unexpected_value)
{
	size_t offset;
	intptr_t res = 0;

	for (offset = 0; offset < item_len; offset += sizeof(uintptr_t)) {
		intptr_t v = *((intptr_t *) (p + offset));

		if ((res = v - cmp_value) != 0) {
			if (unexpected_value)
				*unexpected_value = v;
			break;
		}
	}
	return res;
}

static
void rseq_percpu_zero_item(struct rseq_mempool *pool,
		struct rseq_mempool_range *range, uintptr_t item_offset)
{
	char *init_p;
	int i;

	init_p = __rseq_pool_range_init_ptr(range, item_offset);
	if (init_p)
		bzero(init_p, pool->item_len);
	for (i = 0; i < pool->attr.max_nr_cpus; i++) {
		char *p = __rseq_pool_range_percpu_ptr(range, i,
				item_offset, pool->attr.stride);

		/*
		 * If item is already zeroed, either because the
		 * init range update has propagated or because the
		 * content is already zeroed (e.g. zero page), don't
		 * write to the page. This eliminates useless COW over
		 * the zero page just for overwriting it with zeroes.
		 *
		 * This means zmalloc() in a COW_ZERO policy pool does
		 * not trigger COW for CPUs which are not actively
		 * writing to the pool. This is however not the case for
		 * malloc_init() in populate-all pools if it populates
		 * non-zero content.
		 */
		if (!rseq_cmp_item(p, pool->item_len, 0, NULL))
			continue;
		bzero(p, pool->item_len);
	}
}

static
void rseq_percpu_init_item(struct rseq_mempool *pool,
		struct rseq_mempool_range *range, uintptr_t item_offset,
		void *init_ptr, size_t init_len)
{
	char *init_p;
	int i;

	init_p = __rseq_pool_range_init_ptr(range, item_offset);
	if (init_p)
		memcpy(init_p, init_ptr, init_len);
	for (i = 0; i < pool->attr.max_nr_cpus; i++) {
		char *p = __rseq_pool_range_percpu_ptr(range, i,
				item_offset, pool->attr.stride);

		/*
		 * If the update propagated through a shared mapping,
		 * or the item already has the correct content, skip
		 * writing it into the cpu item to eliminate useless
		 * COW of the page.
		 */
		if (!memcmp(init_ptr, p, init_len))
			continue;
		memcpy(p, init_ptr, init_len);
	}
}

static
void rseq_poison_item(void *p, size_t item_len, uintptr_t poison)
{
	size_t offset;

	for (offset = 0; offset < item_len; offset += sizeof(uintptr_t))
		*((uintptr_t *) (p + offset)) = poison;
}

static
void rseq_percpu_poison_item(struct rseq_mempool *pool,
		struct rseq_mempool_range *range, uintptr_t item_offset)
{
	uintptr_t poison = pool->attr.poison;
	char *init_p;
	int i;

	init_p = __rseq_pool_range_init_ptr(range, item_offset);
	if (init_p)
		rseq_poison_item(init_p, pool->item_len, poison);
	for (i = 0; i < pool->attr.max_nr_cpus; i++) {
		char *p = __rseq_pool_range_percpu_ptr(range, i,
				item_offset, pool->attr.stride);

		/*
		 * If the update propagated through a shared mapping,
		 * or the item already has the correct content, skip
		 * writing it into the cpu item to eliminate useless
		 * COW of the page.
		 *
		 * It is recommended to use zero as poison value for
		 * COW_ZERO pools to eliminate COW due to writing
		 * poison to CPU memory still backed by the zero page.
		 */
		if (rseq_cmp_item(p, pool->item_len, poison, NULL) == 0)
			continue;
		rseq_poison_item(p, pool->item_len, poison);
	}
}

/* Always inline for __builtin_return_address(0). */
static inline __attribute__((always_inline))
void rseq_check_poison_item(const struct rseq_mempool *pool, uintptr_t item_offset,
		void *p, size_t item_len, uintptr_t poison)
{
	intptr_t unexpected_value;

	if (rseq_cmp_item(p, item_len, poison, &unexpected_value) == 0)
		return;

	fprintf(stderr, "%s: Poison corruption detected (0x%lx) for pool: \"%s\" (%p), item offset: %zu, caller: %p.\n",
		__func__, (unsigned long) unexpected_value, get_pool_name(pool), pool, item_offset, (void *) __builtin_return_address(0));
	abort();
}

/* Always inline for __builtin_return_address(0). */
static inline __attribute__((always_inline))
void rseq_percpu_check_poison_item(const struct rseq_mempool *pool,
		const struct rseq_mempool_range *range, uintptr_t item_offset)
{
	uintptr_t poison = pool->attr.poison;
	char *init_p;
	int i;

	if (!pool->attr.robust_set)
		return;
	init_p = __rseq_pool_range_init_ptr(range, item_offset);
	if (init_p)
		rseq_check_poison_item(pool, item_offset, init_p, pool->item_len, poison);
	for (i = 0; i < pool->attr.max_nr_cpus; i++) {
		char *p = __rseq_pool_range_percpu_ptr(range, i,
				item_offset, pool->attr.stride);
		rseq_check_poison_item(pool, item_offset, p, pool->item_len, poison);
	}
}

#ifdef HAVE_LIBNUMA
int rseq_mempool_range_init_numa(void *addr, size_t len, int cpu, int numa_flags)
{
	unsigned long nr_pages, page_len;
	int status[MOVE_PAGES_BATCH_SIZE];
	int nodes[MOVE_PAGES_BATCH_SIZE];
	void *pages[MOVE_PAGES_BATCH_SIZE];
	long ret;

	if (!numa_flags) {
		errno = EINVAL;
		return -1;
	}
	page_len = rseq_get_page_len();
	nr_pages = len >> rseq_get_count_order_ulong(page_len);

	nodes[0] = numa_node_of_cpu(cpu);
	if (nodes[0] < 0)
		return -1;

	for (size_t k = 1; k < RSEQ_ARRAY_SIZE(nodes); ++k) {
		nodes[k] = nodes[0];
	}

	for (unsigned long page = 0; page < nr_pages;) {

		size_t max_k = RSEQ_ARRAY_SIZE(pages);
		size_t left = nr_pages - page;

		if (left < max_k) {
			max_k = left;
		}

		for (size_t k = 0; k < max_k; ++k, ++page) {
			pages[k] = addr + (page * page_len);
			status[k] = -EPERM;
		}

		ret = move_pages(0, max_k, pages, nodes, status, numa_flags);

		if (ret < 0)
			return ret;

		if (ret > 0) {
			fprintf(stderr, "%ld pages were not migrated\n", ret);
			for (size_t k = 0; k < max_k; ++k) {
				if (status[k] < 0)
					fprintf(stderr,
						"Error while moving page %p to numa node %d: %d\n",
						pages[k], nodes[k], -status[k]);
			}
		}
	}
	return 0;
}
#else
int rseq_mempool_range_init_numa(void *addr __attribute__((unused)),
		size_t len __attribute__((unused)),
		int cpu __attribute__((unused)),
		int numa_flags __attribute__((unused)))
{
	errno = ENOSYS;
	return -1;
}
#endif

static
int create_alloc_bitmap(struct rseq_mempool *pool, struct rseq_mempool_range *range)
{
	size_t count;

	count = ((pool->attr.stride >> pool->item_order) + BIT_PER_ULONG - 1) / BIT_PER_ULONG;

	/*
	 * Not being able to create the validation bitmap is an error
	 * that needs to be reported.
	 */
	range->alloc_bitmap = calloc(count, sizeof(unsigned long));
	if (!range->alloc_bitmap)
		return -1;
	return 0;
}

static
bool percpu_addr_in_pool(const struct rseq_mempool *pool, void __rseq_percpu *_addr)
{
	struct rseq_mempool_range *range;
	void *addr = (void *) _addr;

	for (range = pool->range_list; range; range = range->next) {
		if (addr >= range->base && addr < range->base + range->next_unused)
			return true;
	}
	return false;
}

/* Always inline for __builtin_return_address(0). */
static inline __attribute__((always_inline))
void check_free_list(const struct rseq_mempool *pool, bool mapping_accessible)
{
	size_t total_item = 0, total_never_allocated = 0, total_freed = 0,
		max_list_traversal = 0, traversal_iteration = 0;
	struct rseq_mempool_range *range;

	if (!pool->attr.robust_set || !mapping_accessible)
		return;

	for (range = pool->range_list; range; range = range->next) {
		total_item += pool->attr.stride >> pool->item_order;
		total_never_allocated += (pool->attr.stride - range->next_unused) >> pool->item_order;
	}
	max_list_traversal = total_item - total_never_allocated;

	for (struct free_list_node *node = pool->free_list_head, *prev = NULL;
	     node;
	     prev = node, node = node->next) {

		if (traversal_iteration >= max_list_traversal) {
			fprintf(stderr, "%s: Corrupted free-list; Possibly infinite loop in pool \"%s\" (%p), caller %p.\n",
				__func__, get_pool_name(pool), pool, __builtin_return_address(0));
			abort();
		}

		/* Node is out of range. */
		if (!percpu_addr_in_pool(pool, __rseq_free_list_to_percpu_ptr(pool, node))) {
			if (prev)
				fprintf(stderr, "%s: Corrupted free-list node %p -> [out-of-range %p] in pool \"%s\" (%p), caller %p.\n",
					__func__, prev, node, get_pool_name(pool), pool, __builtin_return_address(0));
			else
				fprintf(stderr, "%s: Corrupted free-list node [out-of-range %p] in pool \"%s\" (%p), caller %p.\n",
					__func__, node, get_pool_name(pool), pool, __builtin_return_address(0));
			abort();
		}

		traversal_iteration++;
		total_freed++;
	}

	if (total_never_allocated + total_freed != total_item) {
		fprintf(stderr, "%s: Corrupted free-list in pool \"%s\" (%p); total-item: %zu total-never-used: %zu total-freed: %zu, caller %p.\n",
			__func__, get_pool_name(pool), pool, total_item, total_never_allocated, total_freed, __builtin_return_address(0));
		abort();
	}
}

/* Always inline for __builtin_return_address(0). */
static inline __attribute__((always_inline))
void check_range_poison(const struct rseq_mempool *pool,
		const struct rseq_mempool_range *range)
{
	size_t item_offset;

	for (item_offset = 0; item_offset < range->next_unused;
			item_offset += pool->item_len)
		rseq_percpu_check_poison_item(pool, range, item_offset);
}

/* Always inline for __builtin_return_address(0). */
static inline __attribute__((always_inline))
void check_pool_poison(const struct rseq_mempool *pool, bool mapping_accessible)
{
	struct rseq_mempool_range *range;

	if (!pool->attr.robust_set || !mapping_accessible)
		return;
	for (range = pool->range_list; range; range = range->next)
		check_range_poison(pool, range);
}

/* Always inline for __builtin_return_address(0). */
static inline __attribute__((always_inline))
void destroy_alloc_bitmap(struct rseq_mempool *pool, struct rseq_mempool_range *range)
{
	unsigned long *bitmap = range->alloc_bitmap;
	size_t count, total_leaks = 0;

	if (!bitmap)
		return;

	count = ((pool->attr.stride >> pool->item_order) + BIT_PER_ULONG - 1) / BIT_PER_ULONG;

	/* Assert that all items in the pool were freed. */
	for (size_t k = 0; k < count; ++k)
		total_leaks += rseq_hweight_ulong(bitmap[k]);
	if (total_leaks) {
		fprintf(stderr, "%s: Pool \"%s\" (%p) has %zu leaked items on destroy, caller: %p.\n",
			__func__, get_pool_name(pool), pool, total_leaks, (void *) __builtin_return_address(0));
		abort();
	}

	free(bitmap);
	range->alloc_bitmap = NULL;
}

/* Always inline for __builtin_return_address(0). */
static inline __attribute__((always_inline))
int rseq_mempool_range_destroy(struct rseq_mempool *pool,
		struct rseq_mempool_range *range,
		bool mapping_accessible)
{
	destroy_alloc_bitmap(pool, range);
	if (!mapping_accessible) {
		/*
		 * Only the header pages are populated in the child
		 * process.
		 */
		return munmap(range->header, POOL_HEADER_NR_PAGES * rseq_get_page_len());
	}
	return munmap(range->mmap_addr, range->mmap_len);
}

/*
 * Allocate a memory mapping aligned on @alignment, with an optional
 * @pre_header before the mapping.
 */
static
void *aligned_mmap_anonymous(size_t page_size, size_t len, size_t alignment,
		void **pre_header, size_t pre_header_len)
{
	size_t minimum_page_count, page_count, extra, total_allocate = 0;
	int page_order;
	void *ptr;

	if (len < page_size || alignment < page_size ||
			!is_pow2(alignment) || (len & (alignment - 1))) {
		errno = EINVAL;
		return NULL;
	}
	page_order = rseq_get_count_order_ulong(page_size);
	if (page_order < 0) {
		errno = EINVAL;
		return NULL;
	}
	if (pre_header_len && (pre_header_len & (page_size - 1))) {
		errno = EINVAL;
		return NULL;
	}

	minimum_page_count = (pre_header_len + len) >> page_order;
	page_count = (pre_header_len + len + alignment - page_size) >> page_order;

	assert(page_count >= minimum_page_count);

	ptr = mmap(NULL, page_count << page_order, PROT_READ | PROT_WRITE,
			MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
	if (ptr == MAP_FAILED) {
		ptr = NULL;
		goto alloc_error;
	}

	total_allocate = page_count << page_order;

	if (!(((uintptr_t) ptr + pre_header_len) & (alignment - 1))) {
		/* Pointer is already aligned. ptr points to pre_header. */
		goto out;
	}

	/* Unmap extra before. */
	extra = offset_align((uintptr_t) ptr + pre_header_len, alignment);
	assert(!(extra & (page_size - 1)));
	if (munmap(ptr, extra)) {
		perror("munmap");
		abort();
	}
	total_allocate -= extra;
	ptr += extra;	/* ptr points to pre_header */
	page_count -= extra >> page_order;
out:
	assert(page_count >= minimum_page_count);

	if (page_count > minimum_page_count) {
		void *extra_ptr;

		/* Unmap extra after. */
		extra_ptr = ptr + (minimum_page_count << page_order);
		extra = (page_count - minimum_page_count) << page_order;
		if (munmap(extra_ptr, extra)) {
			perror("munmap");
			abort();
		}
		total_allocate -= extra;
	}

	assert(!(((uintptr_t)ptr + pre_header_len) & (alignment - 1)));
	assert(total_allocate == len + pre_header_len);

alloc_error:
	if (pre_header)
		*pre_header = ptr;
	if (ptr)
		ptr += pre_header_len;
	return ptr;
}
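
/*
 * Worked example of the trim logic above (hypothetical numbers): with
 * page_size = 4 KiB, len = 1 MiB, alignment = 1 MiB and
 * pre_header_len = 8 KiB, the function over-allocates
 * 8 KiB + 2 MiB - 4 KiB, unmaps the pages preceding the first address
 * where ptr + pre_header_len is 1 MiB aligned, then unmaps the excess
 * past minimum_page_count, leaving exactly len + pre_header_len bytes
 * mapped.
 */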

static
int rseq_memfd_create_init(const char *poolname, size_t init_len)
{
	int fd;
	char buf[249];		/* Limit is 249 bytes. */
	const char *name;

	if (poolname) {
		snprintf(buf, sizeof(buf), "%s:rseq-mempool", poolname);
		name = buf;
	} else {
		name = "<anonymous>:rseq-mempool";
	}

	fd = memfd_create(name, MFD_CLOEXEC);
	if (fd < 0) {
		perror("memfd_create");
		goto error;
	}
	if (ftruncate(fd, (off_t) init_len)) {
		if (close(fd))
			perror("close");
		goto error;
	}
	return fd;

error:
	return -1;
}

static
void rseq_memfd_close(int fd)
{
	if (fd < 0)
		return;
	if (close(fd))
		perror("close");
}

static
struct rseq_mempool_range *rseq_mempool_range_create(struct rseq_mempool *pool)
{
	struct rseq_mempool_range *range;
	unsigned long page_size;
	void *header;
	void *base;
	size_t range_len;	/* Range len excludes header. */
	size_t header_len;
	int memfd = -1;

	if (pool->attr.max_nr_ranges &&
			pool->nr_ranges >= pool->attr.max_nr_ranges) {
		errno = ENOMEM;
		return NULL;
	}
	page_size = rseq_get_page_len();

	header_len = POOL_HEADER_NR_PAGES * page_size;
	range_len = pool->attr.stride * pool->attr.max_nr_cpus;
	if (pool->attr.populate_policy == RSEQ_MEMPOOL_POPULATE_COW_INIT)
		range_len += pool->attr.stride;	/* init values */
	if (pool->attr.robust_set)
		range_len += pool->attr.stride;	/* dedicated free list */
	base = aligned_mmap_anonymous(page_size, range_len,
			pool->attr.stride, &header, header_len);
	if (!base)
		return NULL;
	range = (struct rseq_mempool_range *) (base - RANGE_HEADER_OFFSET);
	range->pool = pool;
	range->header = header;
	range->base = base;
	range->mmap_addr = header;
	range->mmap_len = header_len + range_len;

	if (pool->attr.populate_policy == RSEQ_MEMPOOL_POPULATE_COW_INIT) {
		int cpu;

		range->init = base + (pool->attr.stride * pool->attr.max_nr_cpus);
		/* Populate init values pages from memfd */
		memfd = rseq_memfd_create_init(pool->name, pool->attr.stride);
		if (memfd < 0)
			goto error_alloc;
		if (mmap(range->init, pool->attr.stride, PROT_READ | PROT_WRITE,
				MAP_SHARED | MAP_FIXED, memfd, 0) != (void *) range->init)
			goto error_alloc;
		assert(pool->attr.type == MEMPOOL_TYPE_PERCPU);
		/*
		 * Map per-cpu memory as private COW mappings of init values.
		 */
		for (cpu = 0; cpu < pool->attr.max_nr_cpus; cpu++) {
			void *p = base + (pool->attr.stride * cpu);
			size_t len = pool->attr.stride;

			if (mmap(p, len, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_FIXED,
					memfd, 0) != (void *) p)
				goto error_alloc;
		}
		/*
		 * The init values shared mapping should not be shared
		 * with the children processes across fork. Prevent the
		 * whole mapping from being used across fork.
		 */
		if (madvise(base, range_len, MADV_DONTFORK))
			goto error_alloc;

		/*
		 * Write 0x1 in first byte of header first page, which
		 * will be WIPEONFORK (and thus cleared) in children
		 * processes. Used to find out if pool destroy is called
		 * from a child process after fork.
		 */
		*((char *) header) = 0x1;
		if (madvise(header, page_size, MADV_WIPEONFORK))
			goto error_alloc;

		/*
		 * The second header page contains the struct
		 * rseq_mempool_range, which is needed by pool destroy.
		 * Leave this anonymous page populated (COW) in child
		 * processes.
		 */
		rseq_memfd_close(memfd);
		memfd = -1;
	}

	if (pool->attr.robust_set) {
		if (create_alloc_bitmap(pool, range))
			goto error_alloc;
	}
	if (pool->attr.init_set) {
		switch (pool->attr.type) {
		case MEMPOOL_TYPE_GLOBAL:
			if (pool->attr.init_func(pool->attr.init_priv,
					base, pool->attr.stride, -1)) {
				goto error_alloc;
			}
			break;
		case MEMPOOL_TYPE_PERCPU:
		{
			int cpu;

			for (cpu = 0; cpu < pool->attr.max_nr_cpus; cpu++) {
				if (pool->attr.init_func(pool->attr.init_priv,
						base + (pool->attr.stride * cpu),
						pool->attr.stride, cpu)) {
					goto error_alloc;
				}
			}
			break;
		}
		default:
			abort();
		}
	}
	pool->nr_ranges++;
	return range;

error_alloc:
	rseq_memfd_close(memfd);
	(void) rseq_mempool_range_destroy(pool, range, true);
	return NULL;
}

static
bool pool_mappings_accessible(struct rseq_mempool *pool)
{
	struct rseq_mempool_range *range;
	size_t page_size;
	char *addr;

	if (pool->attr.populate_policy != RSEQ_MEMPOOL_POPULATE_COW_INIT)
		return true;
	range = pool->range_list;
	if (!range)
		return true;
	page_size = rseq_get_page_len();
	/*
	 * Header first page is one page before the page containing the
	 * range structure.
	 */
	addr = (char *) ((uintptr_t) range & ~(page_size - 1)) - page_size;
	/*
	 * Look for 0x1 first byte marker in header first page.
	 */
	if (*addr != 0x1)
		return false;
	return true;
}

int rseq_mempool_destroy(struct rseq_mempool *pool)
{
	struct rseq_mempool_range *range, *next_range;
	bool mapping_accessible;
	int ret = 0;

	if (!pool)
		return 0;

	/*
	 * Validate that the pool mappings are accessible before doing
	 * free list/poison validation and unmapping ranges. This allows
	 * calling pool destroy in a child process after a fork for
	 * COW_INIT pools to free pool resources.
	 */
	mapping_accessible = pool_mappings_accessible(pool);

	check_free_list(pool, mapping_accessible);
	check_pool_poison(pool, mapping_accessible);

	/* Iteration safe against removal. */
	for (range = pool->range_list; range && (next_range = range->next, 1); range = next_range) {
		if (rseq_mempool_range_destroy(pool, range, mapping_accessible)) {
			ret = -1;
			goto end;
		}
		/* Update list head to keep list coherent in case of partial failure. */
		pool->range_list = next_range;
	}
	pthread_mutex_destroy(&pool->lock);
	free(pool->name);
	free(pool);
end:
	return ret;
}

struct rseq_mempool *rseq_mempool_create(const char *pool_name,
		size_t item_len, const struct rseq_mempool_attr *_attr)
{
	struct rseq_mempool *pool;
	struct rseq_mempool_attr attr = {};
	int order;

	/* Make sure each item is large enough to contain free list pointers. */
	if (item_len < sizeof(void *))
		item_len = sizeof(void *);

	/* Align item_len on next power of two. */
	order = rseq_get_count_order_ulong(item_len);
	if (order < 0) {
		errno = EINVAL;
		return NULL;
	}
	item_len = 1UL << order;

	if (_attr)
		memcpy(&attr, _attr, sizeof(attr));

	/*
	 * Validate that the pool populate policy requested is known.
	 */
	switch (attr.populate_policy) {
	case RSEQ_MEMPOOL_POPULATE_COW_INIT:
		break;
	case RSEQ_MEMPOOL_POPULATE_COW_ZERO:
		break;
	default:
		errno = EINVAL;
		return NULL;
	}

	switch (attr.type) {
	case MEMPOOL_TYPE_PERCPU:
		if (attr.max_nr_cpus < 0) {
			errno = EINVAL;
			return NULL;
		}
		if (attr.max_nr_cpus == 0) {
			/* Auto-detect. */
			attr.max_nr_cpus = rseq_get_max_nr_cpus();
			if (attr.max_nr_cpus == 0) {
				errno = EINVAL;
				return NULL;
			}
		}
		break;
	case MEMPOOL_TYPE_GLOBAL:
		/* Override populate policy for global type. */
		if (attr.populate_policy == RSEQ_MEMPOOL_POPULATE_COW_INIT)
			attr.populate_policy = RSEQ_MEMPOOL_POPULATE_COW_ZERO;
		/* Use a 1-cpu pool for global mempool type. */
		attr.max_nr_cpus = 1;
		break;
	}
	if (!attr.stride)
		attr.stride = RSEQ_MEMPOOL_STRIDE;	/* Use default */
	if (attr.robust_set && !attr.poison_set) {
		attr.poison_set = true;
		if (attr.populate_policy == RSEQ_MEMPOOL_POPULATE_COW_INIT)
			attr.poison = DEFAULT_COW_INIT_POISON_VALUE;
		else
			attr.poison = DEFAULT_COW_ZERO_POISON_VALUE;
	}
	if (item_len > attr.stride || attr.stride < (size_t) rseq_get_page_len() ||
			!is_pow2(attr.stride)) {
		errno = EINVAL;
		return NULL;
	}

	pool = calloc(1, sizeof(struct rseq_mempool));
	if (!pool)
		return NULL;

	memcpy(&pool->attr, &attr, sizeof(attr));
	pthread_mutex_init(&pool->lock, NULL);
	pool->item_len = item_len;
	pool->item_order = order;

	pool->range_list = rseq_mempool_range_create(pool);
	if (!pool->range_list)
		goto error_alloc;

	if (pool_name) {
		pool->name = strdup(pool_name);
		if (!pool->name)
			goto error_alloc;
	}
	return pool;

error_alloc:
	rseq_mempool_destroy(pool);
	errno = ENOMEM;
	return NULL;
}

/* Always inline for __builtin_return_address(0). */
static inline __attribute__((always_inline))
void set_alloc_slot(struct rseq_mempool *pool, struct rseq_mempool_range *range, size_t item_offset)
{
	unsigned long *bitmap = range->alloc_bitmap;
	size_t item_index = item_offset >> pool->item_order;
	unsigned long mask;
	size_t k;

	if (!bitmap)
		return;

	k = item_index / BIT_PER_ULONG;
	mask = 1ULL << (item_index % BIT_PER_ULONG);

	/* Print error if bit is already set. */
	if (bitmap[k] & mask) {
		fprintf(stderr, "%s: Allocator corruption detected for pool: \"%s\" (%p), item offset: %zu, caller: %p.\n",
			__func__, get_pool_name(pool), pool, item_offset, (void *) __builtin_return_address(0));
		abort();
	}
	bitmap[k] |= mask;
}

static
void __rseq_percpu *__rseq_percpu_malloc(struct rseq_mempool *pool,
		bool zeroed, void *init_ptr, size_t init_len)
{
	struct rseq_mempool_range *range;
	struct free_list_node *node;
	uintptr_t item_offset;
	void __rseq_percpu *addr;

	if (init_len > pool->item_len) {
		errno = EINVAL;
		return NULL;
	}
	pthread_mutex_lock(&pool->lock);
	/* Get first entry from free list. */
	node = pool->free_list_head;
	if (node != NULL) {
		void *range_base, *ptr;

		ptr = __rseq_free_list_to_percpu_ptr(pool, node);
		range_base = (void *) ((uintptr_t) ptr & (~(pool->attr.stride - 1)));
		range = (struct rseq_mempool_range *) (range_base - RANGE_HEADER_OFFSET);
		/* Remove node from free list (update head). */
		pool->free_list_head = node->next;
		item_offset = (uintptr_t) (ptr - range_base);
		rseq_percpu_check_poison_item(pool, range, item_offset);
		addr = __rseq_free_list_to_percpu_ptr(pool, node);
		goto end;
	}
	/*
	 * If the most recent range (first in list) does not have any
	 * room left, create a new range and prepend it to the list
	 * head.
	 */
	range = pool->range_list;
	if (range->next_unused + pool->item_len > pool->attr.stride) {
		range = rseq_mempool_range_create(pool);
		if (!range) {
			pthread_mutex_unlock(&pool->lock);
			errno = ENOMEM;
			return NULL;
		}
		/* Add range to head of list. */
		range->next = pool->range_list;
		pool->range_list = range;
	}
	/* First range in list has room left. */
	item_offset = range->next_unused;
	addr = (void __rseq_percpu *) (range->base + item_offset);
	range->next_unused += pool->item_len;
end:
	set_alloc_slot(pool, range, item_offset);
	pthread_mutex_unlock(&pool->lock);
	if (zeroed)
		rseq_percpu_zero_item(pool, range, item_offset);
	else if (init_ptr) {
		rseq_percpu_init_item(pool, range, item_offset,
				init_ptr, init_len);
	}
	return addr;
}
*rseq_mempool_percpu_malloc(struct rseq_mempool
*pool
)
1114 return __rseq_percpu_malloc(pool
, false, NULL
, 0);
1117 void __rseq_percpu
*rseq_mempool_percpu_zmalloc(struct rseq_mempool
*pool
)
1119 return __rseq_percpu_malloc(pool
, true, NULL
, 0);
1122 void __rseq_percpu
*rseq_mempool_percpu_malloc_init(struct rseq_mempool
*pool
,
1123 void *init_ptr
, size_t len
)
1125 return __rseq_percpu_malloc(pool
, false, init_ptr
, len
);
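
/*
 * Sketch of the three allocation variants above (struct my_item is a
 * hypothetical application type):
 *
 *	struct my_item templ = { .refcount = 1 };
 *	void __rseq_percpu *a, *z, *i;
 *
 *	a = rseq_mempool_percpu_malloc(pool);		// uninitialized
 *	z = rseq_mempool_percpu_zmalloc(pool);		// zeroed on every CPU
 *	i = rseq_mempool_percpu_malloc_init(pool,
 *			&templ, sizeof(templ));		// copied to every CPU
 */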

/* Always inline for __builtin_return_address(0). */
static inline __attribute__((always_inline))
void clear_alloc_slot(struct rseq_mempool *pool, struct rseq_mempool_range *range, size_t item_offset)
{
	unsigned long *bitmap = range->alloc_bitmap;
	size_t item_index = item_offset >> pool->item_order;
	unsigned long mask;
	size_t k;

	if (!bitmap)
		return;

	k = item_index / BIT_PER_ULONG;
	mask = 1ULL << (item_index % BIT_PER_ULONG);

	/* Print error if bit is not set. */
	if (!(bitmap[k] & mask)) {
		fprintf(stderr, "%s: Double-free detected for pool: \"%s\" (%p), item offset: %zu, caller: %p.\n",
			__func__, get_pool_name(pool), pool, item_offset,
			(void *) __builtin_return_address(0));
		abort();
	}
	bitmap[k] &= ~mask;
}

void librseq_mempool_percpu_free(void __rseq_percpu *_ptr, size_t stride)
{
	uintptr_t ptr = (uintptr_t) _ptr;
	void *range_base = (void *) (ptr & (~(stride - 1)));
	struct rseq_mempool_range *range = (struct rseq_mempool_range *) (range_base - RANGE_HEADER_OFFSET);
	struct rseq_mempool *pool = range->pool;
	uintptr_t item_offset = ptr & (stride - 1);
	struct free_list_node *head, *item;

	pthread_mutex_lock(&pool->lock);
	clear_alloc_slot(pool, range, item_offset);
	/* Add ptr to head of free list */
	head = pool->free_list_head;
	if (pool->attr.poison_set)
		rseq_percpu_poison_item(pool, range, item_offset);
	item = __rseq_percpu_to_free_list_ptr(pool, _ptr);
	/*
	 * Setting the next pointer will overwrite the first uintptr_t
	 * poison for either CPU 0 (COW_ZERO, non-robust), or init data
	 * (COW_INIT, non-robust).
	 */
	item->next = head;
	pool->free_list_head = item;
	pthread_mutex_unlock(&pool->lock);
}

struct rseq_mempool_set *rseq_mempool_set_create(void)
{
	struct rseq_mempool_set *pool_set;

	pool_set = calloc(1, sizeof(struct rseq_mempool_set));
	if (!pool_set)
		return NULL;
	pthread_mutex_init(&pool_set->lock, NULL);
	return pool_set;
}

int rseq_mempool_set_destroy(struct rseq_mempool_set *pool_set)
{
	int order, ret;

	for (order = POOL_SET_MIN_ENTRY; order < POOL_SET_NR_ENTRIES; order++) {
		struct rseq_mempool *pool = pool_set->entries[order];

		if (!pool)
			continue;
		ret = rseq_mempool_destroy(pool);
		if (ret)
			return ret;
		pool_set->entries[order] = NULL;
	}
	pthread_mutex_destroy(&pool_set->lock);
	free(pool_set);
	return 0;
}

/* Ownership of pool is handed over to pool set on success. */
int rseq_mempool_set_add_pool(struct rseq_mempool_set *pool_set, struct rseq_mempool *pool)
{
	size_t item_order = pool->item_order;
	int ret = 0;

	pthread_mutex_lock(&pool_set->lock);
	if (pool_set->entries[item_order]) {
		errno = EBUSY;
		ret = -1;
		goto end;
	}
	pool_set->entries[pool->item_order] = pool;
end:
	pthread_mutex_unlock(&pool_set->lock);
	return ret;
}
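
/*
 * Pool set usage sketch (sizes and error handling are illustrative):
 * create pools of increasing item sizes and hand them over to a set,
 * then let the set route each allocation to the smallest fitting pool:
 *
 *	struct rseq_mempool_set *set = rseq_mempool_set_create();
 *	size_t sz;
 *
 *	for (sz = 8; sz <= 64; sz <<= 1) {
 *		struct rseq_mempool *p = rseq_mempool_create(NULL, sz, NULL);
 *
 *		if (!p || rseq_mempool_set_add_pool(set, p))
 *			abort();
 *	}
 *	// Routed to the 32-byte pool:
 *	void __rseq_percpu *obj = rseq_mempool_set_percpu_zmalloc(set, 24);
 */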

static
void __rseq_percpu *__rseq_mempool_set_malloc(struct rseq_mempool_set *pool_set,
		void *init_ptr, size_t len, bool zeroed)
{
	int order, min_order = POOL_SET_MIN_ENTRY;
	struct rseq_mempool *pool;
	void __rseq_percpu *addr;

	order = rseq_get_count_order_ulong(len);
	if (order > POOL_SET_MIN_ENTRY)
		min_order = order;
again:
	pthread_mutex_lock(&pool_set->lock);
	/* First smallest present pool where @len fits. */
	for (order = min_order; order < POOL_SET_NR_ENTRIES; order++) {
		pool = pool_set->entries[order];

		if (!pool)
			continue;
		if (pool->item_len >= len)
			goto found;
	}
	pool = NULL;
found:
	pthread_mutex_unlock(&pool_set->lock);
	if (pool) {
		addr = __rseq_percpu_malloc(pool, zeroed, init_ptr, len);
		if (addr == NULL && errno == ENOMEM) {
			/*
			 * If the allocation failed, try again with a
			 * larger pool.
			 */
			min_order = order + 1;
			goto again;
		}
	} else {
		/* Not found. */
		errno = ENOMEM;
		addr = NULL;
	}
	return addr;
}

void __rseq_percpu *rseq_mempool_set_percpu_malloc(struct rseq_mempool_set *pool_set, size_t len)
{
	return __rseq_mempool_set_malloc(pool_set, NULL, len, false);
}

void __rseq_percpu *rseq_mempool_set_percpu_zmalloc(struct rseq_mempool_set *pool_set, size_t len)
{
	return __rseq_mempool_set_malloc(pool_set, NULL, len, true);
}

void __rseq_percpu *rseq_mempool_set_percpu_malloc_init(struct rseq_mempool_set *pool_set,
		void *init_ptr, size_t len)
{
	return __rseq_mempool_set_malloc(pool_set, init_ptr, len, false);
}

struct rseq_mempool_attr *rseq_mempool_attr_create(void)
{
	return calloc(1, sizeof(struct rseq_mempool_attr));
}

void rseq_mempool_attr_destroy(struct rseq_mempool_attr *attr)
{
	free(attr);
}

int rseq_mempool_attr_set_init(struct rseq_mempool_attr *attr,
		int (*init_func)(void *priv, void *addr, size_t len, int cpu),
		void *init_priv)
{
	if (!attr) {
		errno = EINVAL;
		return -1;
	}
	attr->init_set = true;
	attr->init_func = init_func;
	attr->init_priv = init_priv;
	attr->populate_policy = RSEQ_MEMPOOL_POPULATE_COW_INIT;
	return 0;
}

int rseq_mempool_attr_set_robust(struct rseq_mempool_attr *attr)
{
	if (!attr) {
		errno = EINVAL;
		return -1;
	}
	attr->robust_set = true;
	return 0;
}

int rseq_mempool_attr_set_percpu(struct rseq_mempool_attr *attr,
		size_t stride, int max_nr_cpus)
{
	if (!attr) {
		errno = EINVAL;
		return -1;
	}
	attr->type = MEMPOOL_TYPE_PERCPU;
	attr->stride = stride;
	attr->max_nr_cpus = max_nr_cpus;
	return 0;
}

int rseq_mempool_attr_set_global(struct rseq_mempool_attr *attr,
		size_t stride)
{
	if (!attr) {
		errno = EINVAL;
		return -1;
	}
	attr->type = MEMPOOL_TYPE_GLOBAL;
	attr->stride = stride;
	attr->max_nr_cpus = 0;
	return 0;
}

int rseq_mempool_attr_set_max_nr_ranges(struct rseq_mempool_attr *attr,
		unsigned long max_nr_ranges)
{
	if (!attr) {
		errno = EINVAL;
		return -1;
	}
	attr->max_nr_ranges = max_nr_ranges;
	return 0;
}

int rseq_mempool_attr_set_poison(struct rseq_mempool_attr *attr,
		uintptr_t poison)
{
	if (!attr) {
		errno = EINVAL;
		return -1;
	}
	attr->poison_set = true;
	attr->poison = poison;
	return 0;
}

int rseq_mempool_attr_set_populate_policy(struct rseq_mempool_attr *attr,
		enum rseq_mempool_populate_policy policy)
{
	if (!attr) {
		errno = EINVAL;
		return -1;
	}
	attr->populate_policy = policy;
	return 0;
}

int rseq_mempool_get_max_nr_cpus(struct rseq_mempool *mempool)
{
	if (!mempool || mempool->attr.type != MEMPOOL_TYPE_PERCPU) {
		errno = EINVAL;
		return -1;
	}
	return mempool->attr.max_nr_cpus;
}
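
/*
 * Attribute configuration sketch: a robust per-CPU pool with a custom
 * poison value (the values below are illustrative):
 *
 *	struct rseq_mempool_attr *attr = rseq_mempool_attr_create();
 *	struct rseq_mempool *pool;
 *
 *	if (!attr)
 *		abort();
 *	if (rseq_mempool_attr_set_percpu(attr, RSEQ_MEMPOOL_STRIDE, 0) ||
 *			rseq_mempool_attr_set_robust(attr) ||
 *			rseq_mempool_attr_set_poison(attr, 0xdeadbeef))
 *		abort();
 *	pool = rseq_mempool_create("robust-pool", 64, attr);
 *	rseq_mempool_attr_destroy(attr);	// the pool keeps its own copy
 */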