mempool: Protect whole range of populate none across fork
librseq.git: src/rseq-mempool.c
1 // SPDX-License-Identifier: MIT
2 // SPDX-FileCopyrightText: 2024 Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
3
4 #include <rseq/mempool.h>
5 #include <sys/mman.h>
6 #include <assert.h>
7 #include <string.h>
8 #include <pthread.h>
9 #include <unistd.h>
10 #include <stdlib.h>
11 #include <rseq/compiler.h>
12 #include <errno.h>
13 #include <stdint.h>
14 #include <stdbool.h>
15 #include <stdio.h>
16 #include <fcntl.h>
17
18 #ifdef HAVE_LIBNUMA
19 # include <numa.h>
20 # include <numaif.h>
21 #endif
22
23 #include "rseq-utils.h"
24 #include <rseq/rseq.h>
25
26 /*
27 * rseq-mempool.c: rseq CPU-Local Storage (CLS) memory allocator.
28 *
29	 * The rseq per-CPU memory allocator allows the application to request
30	 * memory pools of CPU-Local memory, each containing objects of a given
31	 * size (rounded up to the next power of 2), reserving a given amount of
32	 * virtual address space per CPU, for a given maximum number of CPUs.
33 *
34 * The per-CPU memory allocator is analogous to TLS (Thread-Local
35 * Storage) memory: TLS is Thread-Local Storage, whereas the per-CPU
36 * memory allocator provides CPU-Local Storage.
37 */
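
/*
 * Minimal usage sketch (illustrative only, not part of this file). It
 * assumes the public helpers rseq_percpu_ptr() and
 * rseq_mempool_percpu_free() from <rseq/mempool.h>,
 * rseq_current_cpu_raw() from <rseq/rseq.h>, and a hypothetical
 * "struct my_data { long counter; }" item type:
 *
 *	struct rseq_mempool_attr *attr;
 *	struct rseq_mempool *pool;
 *	struct my_data __rseq_percpu *p;
 *
 *	attr = rseq_mempool_attr_create();
 *	// max_nr_cpus == 0 requests auto-detection of the CPU count.
 *	(void) rseq_mempool_attr_set_percpu(attr, RSEQ_MEMPOOL_STRIDE, 0);
 *	pool = rseq_mempool_create("my-pool", sizeof(struct my_data), attr);
 *	rseq_mempool_attr_destroy(attr);
 *
 *	p = (struct my_data __rseq_percpu *) rseq_mempool_percpu_zmalloc(pool);
 *	// Access the copy belonging to the current CPU.
 *	rseq_percpu_ptr(p, rseq_current_cpu_raw())->counter++;
 *
 *	rseq_mempool_percpu_free(p);
 *	rseq_mempool_destroy(pool);
 */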
38
39 #define POOL_SET_NR_ENTRIES RSEQ_BITS_PER_LONG
40
41 /*
42 * Smallest allocation should hold enough space for a free list pointer.
43 */
44 #if RSEQ_BITS_PER_LONG == 64
45 # define POOL_SET_MIN_ENTRY 3 /* Smallest item_len=8 */
46 #else
47 # define POOL_SET_MIN_ENTRY 2 /* Smallest item_len=4 */
48 #endif
49
50 #define BIT_PER_ULONG (8 * sizeof(unsigned long))
51
52 #define MOVE_PAGES_BATCH_SIZE 4096
53
54 #define RANGE_HEADER_OFFSET sizeof(struct rseq_mempool_range)
55
56 #if RSEQ_BITS_PER_LONG == 64
57 # define DEFAULT_PRIVATE_POISON_VALUE 0x5555555555555555ULL
58 #else
59 # define DEFAULT_PRIVATE_POISON_VALUE 0x55555555UL
60 #endif
61
62 struct free_list_node;
63
64 struct free_list_node {
65 struct free_list_node *next;
66 };
67
68 enum mempool_type {
69 MEMPOOL_TYPE_GLOBAL = 0, /* Default */
70 MEMPOOL_TYPE_PERCPU = 1,
71 };
72
73 struct rseq_mempool_attr {
74 bool init_set;
75 int (*init_func)(void *priv, void *addr, size_t len, int cpu);
76 void *init_priv;
77
78 bool robust_set;
79
80 enum mempool_type type;
81 size_t stride;
82 int max_nr_cpus;
83
84 unsigned long max_nr_ranges;
85
86 bool poison_set;
87 uintptr_t poison;
88
89 enum rseq_mempool_populate_policy populate_policy;
90 };
91
92 struct rseq_mempool_range;
93
94 struct rseq_mempool_range {
95 struct rseq_mempool_range *next; /* Linked list of ranges. */
96 struct rseq_mempool *pool; /* Backward reference to container pool. */
97
98 /*
99 * Memory layout of a mempool range:
100 * - Header page (contains struct rseq_mempool_range at the very end),
101 * - Base of the per-cpu data, starting with CPU 0.
102 * Aliases with free-list for non-robust populate all pool.
103 * - CPU 1,
104 * ...
105 * - CPU max_nr_cpus - 1
106 * - init values (unpopulated for RSEQ_MEMPOOL_POPULATE_PRIVATE_ALL).
107 * Aliases with free-list for non-robust populate none pool.
108 * - free list (for robust pool).
109 *
110 * The free list aliases the CPU 0 memory area for non-robust
111 * populate all pools. It aliases with init values for
112 * non-robust populate none pools. It is located immediately
113	 * after the init values for robust pools. An illustrative layout
	 * example follows this structure definition.
114	 */
115 void *header;
116 void *base;
117 /*
118	 * The init values contain the malloc_init/zmalloc initial values.
119	 * This pointer is NULL for RSEQ_MEMPOOL_POPULATE_PRIVATE_ALL.
120 */
121 void *init;
122 size_t next_unused;
123
124 /* Pool range mmap/munmap */
125 void *mmap_addr;
126 size_t mmap_len;
127
128 /* Track alloc/free. */
129 unsigned long *alloc_bitmap;
130 };
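
/*
 * Illustrative layout example for the structure above (assumed
 * parameters, for clarity only): with stride = 1 MB and
 * max_nr_cpus = 4, a robust populate-none range occupies, with "base"
 * the stride-aligned start of the per-cpu data:
 *
 *	header page (struct rseq_mempool_range at its end)	[base - page_size, base)
 *	CPU 0 data						base + 0 MB
 *	CPU 1 data						base + 1 MB
 *	CPU 2 data						base + 2 MB
 *	CPU 3 data						base + 3 MB
 *	init values						base + 4 MB
 *	free list						base + 5 MB
 *
 * For a non-robust populate-all pool, the free list aliases the CPU 0
 * area (base + 0 MB) and the init values area is absent.
 */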
131
132 struct rseq_mempool {
133 /* Head of ranges linked-list. */
134 struct rseq_mempool_range *range_list;
135 unsigned long nr_ranges;
136
137 size_t item_len;
138 int item_order;
139
140 /*
141 * The free list chains freed items on the CPU 0 address range.
142 * We should rethink this decision if false sharing between
143 * malloc/free from other CPUs and data accesses from CPU 0
144 * becomes an issue. This is a NULL-terminated singly-linked
145 * list.
146 */
147 struct free_list_node *free_list_head;
148
149 /* This lock protects allocation/free within the pool. */
150 pthread_mutex_t lock;
151
152 struct rseq_mempool_attr attr;
153 char *name;
154 };
155
156 /*
157 * Pool set entries are indexed by item_len rounded to the next power of
158 * 2. A pool set can contain NULL pool entries, in which case the next
159 * large enough entry will be used for allocation.
160 */
161 struct rseq_mempool_set {
162 /* This lock protects add vs malloc/zmalloc within the pool set. */
163 pthread_mutex_t lock;
164 struct rseq_mempool *entries[POOL_SET_NR_ENTRIES];
165 };
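
/*
 * Indexing example (illustrative): a request of len = 24 bytes has
 * rseq_get_count_order_ulong(24) = 5, so the lookup starts at
 * entries[5] (item_len = 32). If entries[5] is NULL, the next
 * non-NULL entry whose pool has item_len >= 24 is used instead.
 */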
166
167 static
168 const char *get_pool_name(const struct rseq_mempool *pool)
169 {
170 return pool->name ? : "<anonymous>";
171 }
172
173 static
174 void *__rseq_pool_range_percpu_ptr(const struct rseq_mempool_range *range, int cpu,
175 uintptr_t item_offset, size_t stride)
176 {
177 return range->base + (stride * cpu) + item_offset;
178 }
179
180 static
181 void *__rseq_pool_range_init_ptr(const struct rseq_mempool_range *range,
182 uintptr_t item_offset)
183 {
184 if (!range->init)
185 return NULL;
186 return range->init + item_offset;
187 }
188
189 static
190 void __rseq_percpu *__rseq_free_list_to_percpu_ptr(const struct rseq_mempool *pool,
191 struct free_list_node *node)
192 {
193 void __rseq_percpu *p = (void __rseq_percpu *) node;
194
195 if (pool->attr.robust_set) {
196 /* Skip cpus. */
197 p -= pool->attr.max_nr_cpus * pool->attr.stride;
198 /* Skip init values */
199 if (pool->attr.populate_policy != RSEQ_MEMPOOL_POPULATE_PRIVATE_ALL)
200 p -= pool->attr.stride;
201
202 } else {
203 /* Populate none free list is in init values */
204 if (pool->attr.populate_policy != RSEQ_MEMPOOL_POPULATE_PRIVATE_ALL)
205 p -= pool->attr.max_nr_cpus * pool->attr.stride;
206 }
207 return p;
208 }
209
210 static
211 struct free_list_node *__rseq_percpu_to_free_list_ptr(const struct rseq_mempool *pool,
212 void __rseq_percpu *p)
213 {
214 if (pool->attr.robust_set) {
215 /* Skip cpus. */
216 p += pool->attr.max_nr_cpus * pool->attr.stride;
217 /* Skip init values */
218 if (pool->attr.populate_policy != RSEQ_MEMPOOL_POPULATE_PRIVATE_ALL)
219 p += pool->attr.stride;
220
221 } else {
222 /* Populate none free list is in init values */
223 if (pool->attr.populate_policy != RSEQ_MEMPOOL_POPULATE_PRIVATE_ALL)
224 p += pool->attr.max_nr_cpus * pool->attr.stride;
225 }
226 return (struct free_list_node *) p;
227 }
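
/*
 * Worked example for the two conversion helpers above (illustrative,
 * with stride = S and max_nr_cpus = N): for an item at per-cpu offset
 * "off" within a range based at "base", the free-list node lives at:
 *
 *	robust pool:			base + N*S + off (plus S when
 *					init values are present), i.e. in
 *					the dedicated free-list area;
 *	non-robust populate-none pool:	base + N*S + off, inside the init
 *					values area;
 *	non-robust populate-all pool:	base + off, aliasing the CPU 0 copy.
 */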
228
229 static
230 intptr_t rseq_cmp_item(void *p, size_t item_len, intptr_t cmp_value, intptr_t *unexpected_value)
231 {
232 size_t offset;
233 intptr_t res = 0;
234
235 for (offset = 0; offset < item_len; offset += sizeof(uintptr_t)) {
236 intptr_t v = *((intptr_t *) (p + offset));
237
238 if ((res = v - cmp_value) != 0) {
239 if (unexpected_value)
240 *unexpected_value = v;
241 break;
242 }
243 }
244 return res;
245 }
246
247 static
248 void rseq_percpu_zero_item(struct rseq_mempool *pool,
249 struct rseq_mempool_range *range, uintptr_t item_offset)
250 {
251 char *init_p = NULL;
252 int i;
253
254 init_p = __rseq_pool_range_init_ptr(range, item_offset);
255 if (init_p)
256 bzero(init_p, pool->item_len);
257 for (i = 0; i < pool->attr.max_nr_cpus; i++) {
258 char *p = __rseq_pool_range_percpu_ptr(range, i,
259 item_offset, pool->attr.stride);
260
261 /*
262 * If item is already zeroed, either because the
263 * init range update has propagated or because the
264 * content is already zeroed (e.g. zero page), don't
265 * write to the page. This eliminates useless COW over
266 * the zero page just for overwriting it with zeroes.
267 *
268		 * This means zmalloc() in a populate-all policy pool does
269		 * not trigger COW for CPUs which are not actively
270 * writing to the pool. This is however not the case for
271 * malloc_init() in populate-all pools if it populates
272 * non-zero content.
273 */
274 if (!rseq_cmp_item(p, pool->item_len, 0, NULL))
275 continue;
276 bzero(p, pool->item_len);
277 }
278 }
279
280 static
281 void rseq_percpu_init_item(struct rseq_mempool *pool,
282 struct rseq_mempool_range *range, uintptr_t item_offset,
283 void *init_ptr, size_t init_len)
284 {
285 char *init_p = NULL;
286 int i;
287
288 init_p = __rseq_pool_range_init_ptr(range, item_offset);
289 if (init_p)
290 memcpy(init_p, init_ptr, init_len);
291 for (i = 0; i < pool->attr.max_nr_cpus; i++) {
292 char *p = __rseq_pool_range_percpu_ptr(range, i,
293 item_offset, pool->attr.stride);
294
295 /*
296 * If the update propagated through a shared mapping,
297 * or the item already has the correct content, skip
298 * writing it into the cpu item to eliminate useless
299 * COW of the page.
300 */
301 if (!memcmp(init_ptr, p, init_len))
302 continue;
303 memcpy(p, init_ptr, init_len);
304 }
305 }
306
307 static
308 void rseq_poison_item(void *p, size_t item_len, uintptr_t poison)
309 {
310 size_t offset;
311
312 for (offset = 0; offset < item_len; offset += sizeof(uintptr_t))
313 *((uintptr_t *) (p + offset)) = poison;
314 }
315
316 static
317 void rseq_percpu_poison_item(struct rseq_mempool *pool,
318 struct rseq_mempool_range *range, uintptr_t item_offset)
319 {
320 uintptr_t poison = pool->attr.poison;
321 char *init_p = NULL;
322 int i;
323
324 init_p = __rseq_pool_range_init_ptr(range, item_offset);
325 if (init_p)
326 rseq_poison_item(init_p, pool->item_len, poison);
327 for (i = 0; i < pool->attr.max_nr_cpus; i++) {
328 char *p = __rseq_pool_range_percpu_ptr(range, i,
329 item_offset, pool->attr.stride);
330
331 /*
332 * If the update propagated through a shared mapping,
333 * or the item already has the correct content, skip
334 * writing it into the cpu item to eliminate useless
335 * COW of the page.
336 *
337 * It is recommended to use zero as poison value for
338 * populate-all pools to eliminate COW due to writing
339 * poison to unused CPU memory.
340 */
341 if (rseq_cmp_item(p, pool->item_len, poison, NULL) == 0)
342 continue;
343 rseq_poison_item(p, pool->item_len, poison);
344 }
345 }
346
347 /* Always inline for __builtin_return_address(0). */
348 static inline __attribute__((always_inline))
349 void rseq_check_poison_item(const struct rseq_mempool *pool, uintptr_t item_offset,
350 void *p, size_t item_len, uintptr_t poison)
351 {
352 intptr_t unexpected_value;
353
354 if (rseq_cmp_item(p, item_len, poison, &unexpected_value) == 0)
355 return;
356
357 fprintf(stderr, "%s: Poison corruption detected (0x%lx) for pool: \"%s\" (%p), item offset: %zu, caller: %p.\n",
358 __func__, (unsigned long) unexpected_value, get_pool_name(pool), pool, item_offset, (void *) __builtin_return_address(0));
359 abort();
360 }
361
362 /* Always inline for __builtin_return_address(0). */
363 static inline __attribute__((always_inline))
364 void rseq_percpu_check_poison_item(const struct rseq_mempool *pool,
365 const struct rseq_mempool_range *range, uintptr_t item_offset)
366 {
367 uintptr_t poison = pool->attr.poison;
368 char *init_p;
369 int i;
370
371 if (!pool->attr.robust_set)
372 return;
373 init_p = __rseq_pool_range_init_ptr(range, item_offset);
374 if (init_p)
375 rseq_check_poison_item(pool, item_offset, init_p, pool->item_len, poison);
376 for (i = 0; i < pool->attr.max_nr_cpus; i++) {
377 char *p = __rseq_pool_range_percpu_ptr(range, i,
378 item_offset, pool->attr.stride);
379 rseq_check_poison_item(pool, item_offset, p, pool->item_len, poison);
380 }
381 }
382
383 #ifdef HAVE_LIBNUMA
384 int rseq_mempool_range_init_numa(void *addr, size_t len, int cpu, int numa_flags)
385 {
386 unsigned long nr_pages, page_len;
387 int status[MOVE_PAGES_BATCH_SIZE];
388 int nodes[MOVE_PAGES_BATCH_SIZE];
389 void *pages[MOVE_PAGES_BATCH_SIZE];
390 long ret;
391
392 if (!numa_flags) {
393 errno = EINVAL;
394 return -1;
395 }
396 page_len = rseq_get_page_len();
397 nr_pages = len >> rseq_get_count_order_ulong(page_len);
398
399 nodes[0] = numa_node_of_cpu(cpu);
400 if (nodes[0] < 0)
401 return -1;
402
403 for (size_t k = 1; k < RSEQ_ARRAY_SIZE(nodes); ++k) {
404 nodes[k] = nodes[0];
405 }
406
407 for (unsigned long page = 0; page < nr_pages;) {
408
409 size_t max_k = RSEQ_ARRAY_SIZE(pages);
410 size_t left = nr_pages - page;
411
412 if (left < max_k) {
413 max_k = left;
414 }
415
416 for (size_t k = 0; k < max_k; ++k, ++page) {
417 pages[k] = addr + (page * page_len);
418 status[k] = -EPERM;
419 }
420
421 ret = move_pages(0, max_k, pages, nodes, status, numa_flags);
422
423 if (ret < 0)
424 return ret;
425
426 if (ret > 0) {
427			fprintf(stderr, "%ld pages were not migrated\n", ret);
428 for (size_t k = 0; k < max_k; ++k) {
429 if (status[k] < 0)
430 fprintf(stderr,
431 "Error while moving page %p to numa node %d: %u\n",
432 pages[k], nodes[k], -status[k]);
433 }
434 }
435 }
436 return 0;
437 }
438 #else
439 int rseq_mempool_range_init_numa(void *addr __attribute__((unused)),
440 size_t len __attribute__((unused)),
441 int cpu __attribute__((unused)),
442 int numa_flags __attribute__((unused)))
443 {
444 errno = ENOSYS;
445 return -1;
446 }
447 #endif
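
/*
 * Sketch of how rseq_mempool_range_init_numa() could be wired as a
 * pool init callback (illustrative; the callback name and the
 * MPOL_MF_MOVE flag choice are assumptions, and this only applies to
 * builds with HAVE_LIBNUMA):
 *
 *	static int init_numa_cb(void *priv __attribute__((unused)),
 *			void *addr, size_t len, int cpu)
 *	{
 *		return rseq_mempool_range_init_numa(addr, len, cpu, MPOL_MF_MOVE);
 *	}
 *
 *	// Before creating the pool:
 *	(void) rseq_mempool_attr_set_init(attr, init_numa_cb, NULL);
 */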
448
449 static
450 int create_alloc_bitmap(struct rseq_mempool *pool, struct rseq_mempool_range *range)
451 {
452 size_t count;
453
454 count = ((pool->attr.stride >> pool->item_order) + BIT_PER_ULONG - 1) / BIT_PER_ULONG;
455
456 /*
457 * Not being able to create the validation bitmap is an error
458 * that needs to be reported.
459 */
460 range->alloc_bitmap = calloc(count, sizeof(unsigned long));
461 if (!range->alloc_bitmap)
462 return -1;
463 return 0;
464 }
465
466 static
467 bool percpu_addr_in_pool(const struct rseq_mempool *pool, void __rseq_percpu *_addr)
468 {
469 struct rseq_mempool_range *range;
470 void *addr = (void *) _addr;
471
472 for (range = pool->range_list; range; range = range->next) {
473 if (addr >= range->base && addr < range->base + range->next_unused)
474 return true;
475 }
476 return false;
477 }
478
479 /* Always inline for __builtin_return_address(0). */
480 static inline __attribute__((always_inline))
481 void check_free_list(const struct rseq_mempool *pool)
482 {
483 size_t total_item = 0, total_never_allocated = 0, total_freed = 0,
484 max_list_traversal = 0, traversal_iteration = 0;
485 struct rseq_mempool_range *range;
486
487 if (!pool->attr.robust_set)
488 return;
489
490 for (range = pool->range_list; range; range = range->next) {
491 total_item += pool->attr.stride >> pool->item_order;
492 total_never_allocated += (pool->attr.stride - range->next_unused) >> pool->item_order;
493 }
494 max_list_traversal = total_item - total_never_allocated;
495
496 for (struct free_list_node *node = pool->free_list_head, *prev = NULL;
497 node;
498 prev = node,
499 node = node->next) {
500
501 if (traversal_iteration >= max_list_traversal) {
502 fprintf(stderr, "%s: Corrupted free-list; Possibly infinite loop in pool \"%s\" (%p), caller %p.\n",
503 __func__, get_pool_name(pool), pool, __builtin_return_address(0));
504 abort();
505 }
506
507 /* Node is out of range. */
508 if (!percpu_addr_in_pool(pool, __rseq_free_list_to_percpu_ptr(pool, node))) {
509 if (prev)
510 fprintf(stderr, "%s: Corrupted free-list node %p -> [out-of-range %p] in pool \"%s\" (%p), caller %p.\n",
511 __func__, prev, node, get_pool_name(pool), pool, __builtin_return_address(0));
512 else
513 fprintf(stderr, "%s: Corrupted free-list node [out-of-range %p] in pool \"%s\" (%p), caller %p.\n",
514 __func__, node, get_pool_name(pool), pool, __builtin_return_address(0));
515 abort();
516 }
517
518 traversal_iteration++;
519 total_freed++;
520 }
521
522 if (total_never_allocated + total_freed != total_item) {
523 fprintf(stderr, "%s: Corrupted free-list in pool \"%s\" (%p); total-item: %zu total-never-used: %zu total-freed: %zu, caller %p.\n",
524 __func__, get_pool_name(pool), pool, total_item, total_never_allocated, total_freed, __builtin_return_address(0));
525 abort();
526 }
527 }
528
529 /* Always inline for __builtin_return_address(0). */
530 static inline __attribute__((always_inline))
531 void check_range_poison(const struct rseq_mempool *pool,
532 const struct rseq_mempool_range *range)
533 {
534 size_t item_offset;
535
536 for (item_offset = 0; item_offset < range->next_unused;
537 item_offset += pool->item_len)
538 rseq_percpu_check_poison_item(pool, range, item_offset);
539 }
540
541 /* Always inline for __builtin_return_address(0). */
542 static inline __attribute__((always_inline))
543 void check_pool_poison(const struct rseq_mempool *pool)
544 {
545 struct rseq_mempool_range *range;
546
547 if (!pool->attr.robust_set)
548 return;
549 for (range = pool->range_list; range; range = range->next)
550 check_range_poison(pool, range);
551 }
552
553 /* Always inline for __builtin_return_address(0). */
554 static inline __attribute__((always_inline))
555 void destroy_alloc_bitmap(struct rseq_mempool *pool, struct rseq_mempool_range *range)
556 {
557 unsigned long *bitmap = range->alloc_bitmap;
558 size_t count, total_leaks = 0;
559
560 if (!bitmap)
561 return;
562
563 count = ((pool->attr.stride >> pool->item_order) + BIT_PER_ULONG - 1) / BIT_PER_ULONG;
564
565 /* Assert that all items in the pool were freed. */
566 for (size_t k = 0; k < count; ++k)
567 total_leaks += rseq_hweight_ulong(bitmap[k]);
568 if (total_leaks) {
569 fprintf(stderr, "%s: Pool \"%s\" (%p) has %zu leaked items on destroy, caller: %p.\n",
570 __func__, get_pool_name(pool), pool, total_leaks, (void *) __builtin_return_address(0));
571 abort();
572 }
573
574 free(bitmap);
575 range->alloc_bitmap = NULL;
576 }
577
578 /* Always inline for __builtin_return_address(0). */
579 static inline __attribute__((always_inline))
580 int rseq_mempool_range_destroy(struct rseq_mempool *pool,
581 struct rseq_mempool_range *range)
582 {
583 destroy_alloc_bitmap(pool, range);
584
585 /* range is a header located one page before the aligned mapping. */
586 return munmap(range->mmap_addr, range->mmap_len);
587 }
588
589 /*
590 * Allocate a memory mapping aligned on @alignment, with an optional
591 * @pre_header before the mapping.
592 */
593 static
594 void *aligned_mmap_anonymous(size_t page_size, size_t len, size_t alignment,
595 void **pre_header, size_t pre_header_len)
596 {
597 size_t minimum_page_count, page_count, extra, total_allocate = 0;
598 int page_order;
599 void *ptr;
600
601 if (len < page_size || alignment < page_size ||
602 !is_pow2(alignment) || (len & (alignment - 1))) {
603 errno = EINVAL;
604 return NULL;
605 }
606 page_order = rseq_get_count_order_ulong(page_size);
607 if (page_order < 0) {
608 errno = EINVAL;
609 return NULL;
610 }
611 if (pre_header_len && (pre_header_len & (page_size - 1))) {
612 errno = EINVAL;
613 return NULL;
614 }
615
616 minimum_page_count = (pre_header_len + len) >> page_order;
617 page_count = (pre_header_len + len + alignment - page_size) >> page_order;
618
619 assert(page_count >= minimum_page_count);
620
621 ptr = mmap(NULL, page_count << page_order, PROT_READ | PROT_WRITE,
622 MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
623 if (ptr == MAP_FAILED) {
624 ptr = NULL;
625 goto alloc_error;
626 }
627
628 total_allocate = page_count << page_order;
629
630 if (!(((uintptr_t) ptr + pre_header_len) & (alignment - 1))) {
631 /* Pointer is already aligned. ptr points to pre_header. */
632 goto out;
633 }
634
635 /* Unmap extra before. */
636 extra = offset_align((uintptr_t) ptr + pre_header_len, alignment);
637 assert(!(extra & (page_size - 1)));
638 if (munmap(ptr, extra)) {
639 perror("munmap");
640 abort();
641 }
642 total_allocate -= extra;
643 ptr += extra; /* ptr points to pre_header */
644 page_count -= extra >> page_order;
645 out:
646 assert(page_count >= minimum_page_count);
647
648 if (page_count > minimum_page_count) {
649 void *extra_ptr;
650
651 /* Unmap extra after. */
652 extra_ptr = ptr + (minimum_page_count << page_order);
653 extra = (page_count - minimum_page_count) << page_order;
654 if (munmap(extra_ptr, extra)) {
655 perror("munmap");
656 abort();
657 }
658 total_allocate -= extra;
659 }
660
661 assert(!(((uintptr_t)ptr + pre_header_len) & (alignment - 1)));
662 assert(total_allocate == len + pre_header_len);
663
664 alloc_error:
665 if (ptr) {
666 if (pre_header)
667 *pre_header = ptr;
668 ptr += pre_header_len;
669 }
670 return ptr;
671 }
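
/*
 * Worked example for aligned_mmap_anonymous() (illustrative numbers):
 * with page_size = 4 KB, len = 4 MB, alignment = 4 MB and
 * pre_header_len = 4 KB, minimum_page_count = (4 KB + 4 MB) / 4 KB =
 * 1025 pages while page_count = (4 KB + 4 MB + 4 MB - 4 KB) / 4 KB =
 * 2048 pages are mapped initially. The leading pages are unmapped up
 * to the first address where ptr + 4 KB is 4 MB-aligned, then the
 * trailing excess is unmapped, leaving exactly 1025 pages: one header
 * page followed by the 4 MB-aligned range.
 */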
672
673 static
674 int rseq_memfd_create_init(const char *poolname, size_t init_len)
675 {
676 int fd;
677 char buf[249]; /* Limit is 249 bytes. */
678 const char *name;
679
680 if (poolname) {
681 snprintf(buf, sizeof(buf), "%s:rseq-mempool", poolname);
682 name = buf;
683 } else {
684 name = "<anonymous>:rseq-mempool";
685 }
686
687 fd = memfd_create(name, MFD_CLOEXEC);
688 if (fd < 0) {
689 perror("memfd_create");
690 goto end;
691 }
692 if (ftruncate(fd, (off_t) init_len)) {
693 if (close(fd))
694 perror("close");
695 fd = -1;
696 goto end;
697 }
698 end:
699 return fd;
700 }
701
702 static
703 void rseq_memfd_close(int fd)
704 {
705 if (fd < 0)
706 return;
707 if (close(fd))
708 perror("close");
709 }
710
711 static
712 struct rseq_mempool_range *rseq_mempool_range_create(struct rseq_mempool *pool)
713 {
714 struct rseq_mempool_range *range;
715 unsigned long page_size;
716 void *header;
717 void *base;
718 size_t range_len; /* Range len excludes header. */
719 int memfd = -1;
720
721 if (pool->attr.max_nr_ranges &&
722 pool->nr_ranges >= pool->attr.max_nr_ranges) {
723 errno = ENOMEM;
724 return NULL;
725 }
726 page_size = rseq_get_page_len();
727
728 range_len = pool->attr.stride * pool->attr.max_nr_cpus;
729 if (pool->attr.populate_policy != RSEQ_MEMPOOL_POPULATE_PRIVATE_ALL)
730 range_len += pool->attr.stride; /* init values */
731 if (pool->attr.robust_set)
732 range_len += pool->attr.stride; /* free list */
733 base = aligned_mmap_anonymous(page_size, range_len,
734 pool->attr.stride, &header, page_size);
735 if (!base)
736 return NULL;
737 range = (struct rseq_mempool_range *) (base - RANGE_HEADER_OFFSET);
738 range->pool = pool;
739 range->header = header;
740 range->base = base;
741 range->mmap_addr = header;
742 range->mmap_len = page_size + range_len;
743
744 if (pool->attr.populate_policy != RSEQ_MEMPOOL_POPULATE_PRIVATE_ALL) {
745 range->init = base + (pool->attr.stride * pool->attr.max_nr_cpus);
746 /* Populate init values pages from memfd */
747 memfd = rseq_memfd_create_init(pool->name, pool->attr.stride);
748 if (memfd < 0)
749 goto error_alloc;
750 if (mmap(range->init, pool->attr.stride, PROT_READ | PROT_WRITE,
751 MAP_SHARED | MAP_FIXED, memfd, 0) != (void *) range->init) {
752 goto error_alloc;
753 }
754 assert(pool->attr.type == MEMPOOL_TYPE_PERCPU);
755 /*
756 * Map per-cpu memory as private COW mappings of init values.
757 */
758 {
759 int cpu;
760
761 for (cpu = 0; cpu < pool->attr.max_nr_cpus; cpu++) {
762 void *p = base + (pool->attr.stride * cpu);
763 size_t len = pool->attr.stride;
764
765 if (mmap(p, len, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_FIXED,
766 memfd, 0) != (void *) p) {
767 goto error_alloc;
768 }
769 }
770 }
771 /*
772 * The init values shared mapping should not be shared
773 * with the children processes across fork. Prevent the
774 * whole mapping from being used across fork.
775 */
776		if (madvise(range->mmap_addr, range->mmap_len, MADV_DONTFORK))
777 goto error_alloc;
778 rseq_memfd_close(memfd);
779 memfd = -1;
780 }
781
782 if (pool->attr.robust_set) {
783 if (create_alloc_bitmap(pool, range))
784 goto error_alloc;
785 }
786 if (pool->attr.init_set) {
787 switch (pool->attr.type) {
788 case MEMPOOL_TYPE_GLOBAL:
789 if (pool->attr.init_func(pool->attr.init_priv,
790 base, pool->attr.stride, -1)) {
791 goto error_alloc;
792 }
793 break;
794 case MEMPOOL_TYPE_PERCPU:
795 {
796 int cpu;
797 for (cpu = 0; cpu < pool->attr.max_nr_cpus; cpu++) {
798 if (pool->attr.init_func(pool->attr.init_priv,
799 base + (pool->attr.stride * cpu),
800 pool->attr.stride, cpu)) {
801 goto error_alloc;
802 }
803 }
804 break;
805 }
806 default:
807 abort();
808 }
809 }
810 pool->nr_ranges++;
811 return range;
812
813 error_alloc:
814 rseq_memfd_close(memfd);
815 (void) rseq_mempool_range_destroy(pool, range);
816 return NULL;
817 }
818
819 int rseq_mempool_destroy(struct rseq_mempool *pool)
820 {
821 struct rseq_mempool_range *range, *next_range;
822 int ret = 0;
823
824 if (!pool)
825 return 0;
826 check_free_list(pool);
827 check_pool_poison(pool);
828 /* Iteration safe against removal. */
829 for (range = pool->range_list; range && (next_range = range->next, 1); range = next_range) {
830		ret = rseq_mempool_range_destroy(pool, range);
		if (ret)
831			goto end;
832 /* Update list head to keep list coherent in case of partial failure. */
833 pool->range_list = next_range;
834 }
835 pthread_mutex_destroy(&pool->lock);
836 free(pool->name);
837 free(pool);
838 end:
839 return ret;
840 }
841
842 struct rseq_mempool *rseq_mempool_create(const char *pool_name,
843 size_t item_len, const struct rseq_mempool_attr *_attr)
844 {
845 struct rseq_mempool *pool;
846 struct rseq_mempool_attr attr = {};
847 int order;
848
849 /* Make sure each item is large enough to contain free list pointers. */
850 if (item_len < sizeof(void *))
851 item_len = sizeof(void *);
852
853 /* Align item_len on next power of two. */
854 order = rseq_get_count_order_ulong(item_len);
855 if (order < 0) {
856 errno = EINVAL;
857 return NULL;
858 }
859 item_len = 1UL << order;
860
861 if (_attr)
862 memcpy(&attr, _attr, sizeof(attr));
863
864 switch (attr.type) {
865 case MEMPOOL_TYPE_PERCPU:
866 if (attr.max_nr_cpus < 0) {
867 errno = EINVAL;
868 return NULL;
869 }
870 if (attr.max_nr_cpus == 0) {
871 /* Auto-detect */
872 attr.max_nr_cpus = rseq_get_max_nr_cpus();
873 if (attr.max_nr_cpus == 0) {
874 errno = EINVAL;
875 return NULL;
876 }
877 }
878 break;
879 case MEMPOOL_TYPE_GLOBAL:
880 /* Override populate policy for global type. */
881 if (attr.populate_policy == RSEQ_MEMPOOL_POPULATE_PRIVATE_NONE)
882 attr.populate_policy = RSEQ_MEMPOOL_POPULATE_PRIVATE_ALL;
883 /* Use a 1-cpu pool for global mempool type. */
884 attr.max_nr_cpus = 1;
885 break;
886 }
887 if (!attr.stride)
888 attr.stride = RSEQ_MEMPOOL_STRIDE; /* Use default */
889 if (attr.robust_set && !attr.poison_set) {
890 attr.poison_set = true;
891 attr.poison = DEFAULT_PRIVATE_POISON_VALUE;
892 }
893 if (item_len > attr.stride || attr.stride < (size_t) rseq_get_page_len() ||
894 !is_pow2(attr.stride)) {
895 errno = EINVAL;
896 return NULL;
897 }
898
899 pool = calloc(1, sizeof(struct rseq_mempool));
900 if (!pool)
901 return NULL;
902
903 memcpy(&pool->attr, &attr, sizeof(attr));
904 pthread_mutex_init(&pool->lock, NULL);
905 pool->item_len = item_len;
906 pool->item_order = order;
907
908 pool->range_list = rseq_mempool_range_create(pool);
909 if (!pool->range_list)
910 goto error_alloc;
911
912 if (pool_name) {
913 pool->name = strdup(pool_name);
914 if (!pool->name)
915 goto error_alloc;
916 }
917 return pool;
918
919 error_alloc:
920 rseq_mempool_destroy(pool);
921 errno = ENOMEM;
922 return NULL;
923 }
924
925 /* Always inline for __builtin_return_address(0). */
926 static inline __attribute__((always_inline))
927 void set_alloc_slot(struct rseq_mempool *pool, struct rseq_mempool_range *range, size_t item_offset)
928 {
929 unsigned long *bitmap = range->alloc_bitmap;
930 size_t item_index = item_offset >> pool->item_order;
931 unsigned long mask;
932 size_t k;
933
934 if (!bitmap)
935 return;
936
937 k = item_index / BIT_PER_ULONG;
938 mask = 1ULL << (item_index % BIT_PER_ULONG);
939
940 /* Print error if bit is already set. */
941 if (bitmap[k] & mask) {
942 fprintf(stderr, "%s: Allocator corruption detected for pool: \"%s\" (%p), item offset: %zu, caller: %p.\n",
943 __func__, get_pool_name(pool), pool, item_offset, (void *) __builtin_return_address(0));
944 abort();
945 }
946 bitmap[k] |= mask;
947 }
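
/*
 * Bitmap indexing example (illustrative): with item_order = 6
 * (64-byte items) and item_offset = 0x1480, item_index = 0x1480 >> 6
 * = 82, so on a 64-bit build the tracking bit is bit 82 % 64 = 18 of
 * word 82 / 64 = 1 of the allocation bitmap.
 */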
948
949 static
950 void __rseq_percpu *__rseq_percpu_malloc(struct rseq_mempool *pool,
951 bool zeroed, void *init_ptr, size_t init_len)
952 {
953 struct rseq_mempool_range *range;
954 struct free_list_node *node;
955 uintptr_t item_offset;
956 void __rseq_percpu *addr;
957
958 if (init_len > pool->item_len) {
959 errno = EINVAL;
960 return NULL;
961 }
962 pthread_mutex_lock(&pool->lock);
963 /* Get first entry from free list. */
964 node = pool->free_list_head;
965 if (node != NULL) {
966 void *range_base, *ptr;
967
968 ptr = __rseq_free_list_to_percpu_ptr(pool, node);
969 range_base = (void *) ((uintptr_t) ptr & (~(pool->attr.stride - 1)));
970 range = (struct rseq_mempool_range *) (range_base - RANGE_HEADER_OFFSET);
971 /* Remove node from free list (update head). */
972 pool->free_list_head = node->next;
973 item_offset = (uintptr_t) (ptr - range_base);
974 rseq_percpu_check_poison_item(pool, range, item_offset);
975 addr = __rseq_free_list_to_percpu_ptr(pool, node);
976 goto end;
977 }
978 /*
979 * If the most recent range (first in list) does not have any
980 * room left, create a new range and prepend it to the list
981 * head.
982 */
983 range = pool->range_list;
984 if (range->next_unused + pool->item_len > pool->attr.stride) {
985 range = rseq_mempool_range_create(pool);
986 if (!range) {
987 errno = ENOMEM;
988 addr = NULL;
989 goto end;
990 }
991 /* Add range to head of list. */
992 range->next = pool->range_list;
993 pool->range_list = range;
994 }
995 /* First range in list has room left. */
996 item_offset = range->next_unused;
997 addr = (void __rseq_percpu *) (range->base + item_offset);
998 range->next_unused += pool->item_len;
999 end:
1000 if (addr)
1001 set_alloc_slot(pool, range, item_offset);
1002 pthread_mutex_unlock(&pool->lock);
1003 if (addr) {
1004 if (zeroed)
1005 rseq_percpu_zero_item(pool, range, item_offset);
1006 else if (init_ptr) {
1007 rseq_percpu_init_item(pool, range, item_offset,
1008 init_ptr, init_len);
1009 }
1010 }
1011 return addr;
1012 }
1013
1014 void __rseq_percpu *rseq_mempool_percpu_malloc(struct rseq_mempool *pool)
1015 {
1016 return __rseq_percpu_malloc(pool, false, NULL, 0);
1017 }
1018
1019 void __rseq_percpu *rseq_mempool_percpu_zmalloc(struct rseq_mempool *pool)
1020 {
1021 return __rseq_percpu_malloc(pool, true, NULL, 0);
1022 }
1023
1024 void __rseq_percpu *rseq_mempool_percpu_malloc_init(struct rseq_mempool *pool,
1025 void *init_ptr, size_t len)
1026 {
1027 return __rseq_percpu_malloc(pool, false, init_ptr, len);
1028 }
1029
1030 /* Always inline for __builtin_return_address(0). */
1031 static inline __attribute__((always_inline))
1032 void clear_alloc_slot(struct rseq_mempool *pool, struct rseq_mempool_range *range, size_t item_offset)
1033 {
1034 unsigned long *bitmap = range->alloc_bitmap;
1035 size_t item_index = item_offset >> pool->item_order;
1036 unsigned long mask;
1037 size_t k;
1038
1039 if (!bitmap)
1040 return;
1041
1042 k = item_index / BIT_PER_ULONG;
1043 mask = 1ULL << (item_index % BIT_PER_ULONG);
1044
1045 /* Print error if bit is not set. */
1046 if (!(bitmap[k] & mask)) {
1047 fprintf(stderr, "%s: Double-free detected for pool: \"%s\" (%p), item offset: %zu, caller: %p.\n",
1048 __func__, get_pool_name(pool), pool, item_offset,
1049 (void *) __builtin_return_address(0));
1050 abort();
1051 }
1052 bitmap[k] &= ~mask;
1053 }
1054
1055 void librseq_mempool_percpu_free(void __rseq_percpu *_ptr, size_t stride)
1056 {
1057 uintptr_t ptr = (uintptr_t) _ptr;
1058 void *range_base = (void *) (ptr & (~(stride - 1)));
1059 struct rseq_mempool_range *range = (struct rseq_mempool_range *) (range_base - RANGE_HEADER_OFFSET);
1060 struct rseq_mempool *pool = range->pool;
1061 uintptr_t item_offset = ptr & (stride - 1);
1062 struct free_list_node *head, *item;
1063
1064 pthread_mutex_lock(&pool->lock);
1065 clear_alloc_slot(pool, range, item_offset);
1066 /* Add ptr to head of free list */
1067 head = pool->free_list_head;
1068 if (pool->attr.poison_set)
1069 rseq_percpu_poison_item(pool, range, item_offset);
1070 item = __rseq_percpu_to_free_list_ptr(pool, _ptr);
1071 /*
1072 * Setting the next pointer will overwrite the first uintptr_t
1073 * poison for either CPU 0 (populate all) or init data (populate
1074 * none).
1075 */
1076 item->next = head;
1077 pool->free_list_head = item;
1078 pthread_mutex_unlock(&pool->lock);
1079 }
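
/*
 * Range lookup example for the free path above (illustrative
 * addresses): with stride = 0x100000 (1 MB) and _ptr = 0x7f0000123440,
 * range_base = 0x7f0000100000 and item_offset = 0x23440. The struct
 * rseq_mempool_range header is found at range_base -
 * RANGE_HEADER_OFFSET, i.e. at the very end of the header page
 * preceding range_base.
 */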
1080
1081 struct rseq_mempool_set *rseq_mempool_set_create(void)
1082 {
1083 struct rseq_mempool_set *pool_set;
1084
1085 pool_set = calloc(1, sizeof(struct rseq_mempool_set));
1086 if (!pool_set)
1087 return NULL;
1088 pthread_mutex_init(&pool_set->lock, NULL);
1089 return pool_set;
1090 }
1091
1092 int rseq_mempool_set_destroy(struct rseq_mempool_set *pool_set)
1093 {
1094 int order, ret;
1095
1096 for (order = POOL_SET_MIN_ENTRY; order < POOL_SET_NR_ENTRIES; order++) {
1097 struct rseq_mempool *pool = pool_set->entries[order];
1098
1099 if (!pool)
1100 continue;
1101 ret = rseq_mempool_destroy(pool);
1102 if (ret)
1103 return ret;
1104 pool_set->entries[order] = NULL;
1105 }
1106 pthread_mutex_destroy(&pool_set->lock);
1107 free(pool_set);
1108 return 0;
1109 }
1110
1111 /* Ownership of pool is handed over to pool set on success. */
1112 int rseq_mempool_set_add_pool(struct rseq_mempool_set *pool_set, struct rseq_mempool *pool)
1113 {
1114 size_t item_order = pool->item_order;
1115 int ret = 0;
1116
1117 pthread_mutex_lock(&pool_set->lock);
1118 if (pool_set->entries[item_order]) {
1119 errno = EBUSY;
1120 ret = -1;
1121 goto end;
1122 }
1123 pool_set->entries[pool->item_order] = pool;
1124 end:
1125 pthread_mutex_unlock(&pool_set->lock);
1126 return ret;
1127 }
1128
1129 static
1130 void __rseq_percpu *__rseq_mempool_set_malloc(struct rseq_mempool_set *pool_set,
1131 void *init_ptr, size_t len, bool zeroed)
1132 {
1133 int order, min_order = POOL_SET_MIN_ENTRY;
1134 struct rseq_mempool *pool;
1135 void __rseq_percpu *addr;
1136
1137 order = rseq_get_count_order_ulong(len);
1138 if (order > POOL_SET_MIN_ENTRY)
1139 min_order = order;
1140 again:
1141 pthread_mutex_lock(&pool_set->lock);
1142 /* First smallest present pool where @len fits. */
1143 for (order = min_order; order < POOL_SET_NR_ENTRIES; order++) {
1144 pool = pool_set->entries[order];
1145
1146 if (!pool)
1147 continue;
1148 if (pool->item_len >= len)
1149 goto found;
1150 }
1151 pool = NULL;
1152 found:
1153 pthread_mutex_unlock(&pool_set->lock);
1154 if (pool) {
1155 addr = __rseq_percpu_malloc(pool, zeroed, init_ptr, len);
1156 if (addr == NULL && errno == ENOMEM) {
1157 /*
1158 * If the allocation failed, try again with a
1159 * larger pool.
1160 */
1161 min_order = order + 1;
1162 goto again;
1163 }
1164 } else {
1165 /* Not found. */
1166 errno = ENOMEM;
1167 addr = NULL;
1168 }
1169 return addr;
1170 }
1171
1172 void __rseq_percpu *rseq_mempool_set_percpu_malloc(struct rseq_mempool_set *pool_set, size_t len)
1173 {
1174 return __rseq_mempool_set_malloc(pool_set, NULL, len, false);
1175 }
1176
1177 void __rseq_percpu *rseq_mempool_set_percpu_zmalloc(struct rseq_mempool_set *pool_set, size_t len)
1178 {
1179 return __rseq_mempool_set_malloc(pool_set, NULL, len, true);
1180 }
1181
1182 void __rseq_percpu *rseq_mempool_set_percpu_malloc_init(struct rseq_mempool_set *pool_set,
1183 void *init_ptr, size_t len)
1184 {
1185	return __rseq_mempool_set_malloc(pool_set, init_ptr, len, false);
1186 }
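
/*
 * Pool set usage sketch (illustrative; error handling omitted, and the
 * pool names, sizes and the rseq_mempool_percpu_free() helper from
 * <rseq/mempool.h> are assumptions):
 *
 *	struct rseq_mempool_set *set;
 *	void __rseq_percpu *p;
 *	size_t sizes[] = { 32, 128, 1024 };
 *	size_t i;
 *
 *	set = rseq_mempool_set_create();
 *	for (i = 0; i < 3; i++) {
 *		// NULL attr creates a global (1-cpu) pool; pass a
 *		// per-cpu attr for CPU-local data.
 *		struct rseq_mempool *pool =
 *			rseq_mempool_create("set-pool", sizes[i], NULL);
 *
 *		if (rseq_mempool_set_add_pool(set, pool))
 *			abort();	// Ownership moves to the set on success only.
 *	}
 *	// Served from the 128-byte pool: smallest item_len >= 100.
 *	p = rseq_mempool_set_percpu_zmalloc(set, 100);
 *	rseq_mempool_percpu_free(p);
 *	(void) rseq_mempool_set_destroy(set);	// Also destroys the added pools.
 */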
1187
1188 struct rseq_mempool_attr *rseq_mempool_attr_create(void)
1189 {
1190 return calloc(1, sizeof(struct rseq_mempool_attr));
1191 }
1192
1193 void rseq_mempool_attr_destroy(struct rseq_mempool_attr *attr)
1194 {
1195 free(attr);
1196 }
1197
1198 int rseq_mempool_attr_set_init(struct rseq_mempool_attr *attr,
1199 int (*init_func)(void *priv, void *addr, size_t len, int cpu),
1200 void *init_priv)
1201 {
1202 if (!attr) {
1203 errno = EINVAL;
1204 return -1;
1205 }
1206 attr->init_set = true;
1207 attr->init_func = init_func;
1208 attr->init_priv = init_priv;
1209 return 0;
1210 }
1211
1212 int rseq_mempool_attr_set_robust(struct rseq_mempool_attr *attr)
1213 {
1214 if (!attr) {
1215 errno = EINVAL;
1216 return -1;
1217 }
1218 attr->robust_set = true;
1219 return 0;
1220 }
1221
1222 int rseq_mempool_attr_set_percpu(struct rseq_mempool_attr *attr,
1223 size_t stride, int max_nr_cpus)
1224 {
1225 if (!attr) {
1226 errno = EINVAL;
1227 return -1;
1228 }
1229 attr->type = MEMPOOL_TYPE_PERCPU;
1230 attr->stride = stride;
1231 attr->max_nr_cpus = max_nr_cpus;
1232 return 0;
1233 }
1234
1235 int rseq_mempool_attr_set_global(struct rseq_mempool_attr *attr,
1236 size_t stride)
1237 {
1238 if (!attr) {
1239 errno = EINVAL;
1240 return -1;
1241 }
1242 attr->type = MEMPOOL_TYPE_GLOBAL;
1243 attr->stride = stride;
1244 attr->max_nr_cpus = 0;
1245 return 0;
1246 }
1247
1248 int rseq_mempool_attr_set_max_nr_ranges(struct rseq_mempool_attr *attr,
1249 unsigned long max_nr_ranges)
1250 {
1251 if (!attr) {
1252 errno = EINVAL;
1253 return -1;
1254 }
1255 attr->max_nr_ranges = max_nr_ranges;
1256 return 0;
1257 }
1258
1259 int rseq_mempool_attr_set_poison(struct rseq_mempool_attr *attr,
1260 uintptr_t poison)
1261 {
1262 if (!attr) {
1263 errno = EINVAL;
1264 return -1;
1265 }
1266 attr->poison_set = true;
1267 attr->poison = poison;
1268 return 0;
1269 }
1270
1271 int rseq_mempool_attr_set_populate_policy(struct rseq_mempool_attr *attr,
1272 enum rseq_mempool_populate_policy policy)
1273 {
1274 if (!attr) {
1275 errno = EINVAL;
1276 return -1;
1277 }
1278 attr->populate_policy = policy;
1279 return 0;
1280 }
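
/*
 * Attribute configuration sketch (illustrative, error handling
 * omitted): a robust per-CPU pool limited to 8 ranges with a custom
 * poison value. Attributes are copied by rseq_mempool_create(), so
 * they can be destroyed right after the pool is created:
 *
 *	struct rseq_mempool_attr *attr = rseq_mempool_attr_create();
 *	struct rseq_mempool *pool;
 *
 *	(void) rseq_mempool_attr_set_percpu(attr, RSEQ_MEMPOOL_STRIDE, 0);
 *	(void) rseq_mempool_attr_set_robust(attr);
 *	(void) rseq_mempool_attr_set_max_nr_ranges(attr, 8);
 *	(void) rseq_mempool_attr_set_poison(attr, 0xdeadbeefUL);
 *	pool = rseq_mempool_create("robust-pool", 64, attr);
 *	rseq_mempool_attr_destroy(attr);
 */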
1281
1282 int rseq_mempool_get_max_nr_cpus(struct rseq_mempool *mempool)
1283 {
1284 if (!mempool || mempool->attr.type != MEMPOOL_TYPE_PERCPU) {
1285 errno = EINVAL;
1286 return -1;
1287 }
1288 return mempool->attr.max_nr_cpus;
1289 }