[librseq.git] / src / rseq-mempool.c
1// SPDX-License-Identifier: MIT
2// SPDX-FileCopyrightText: 2024 Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
3// SPDX-FileCopyrightText: 2024 Olivier Dion <odion@efficios.com>
4
5#include <rseq/mempool.h>
6#include <sys/mman.h>
7#include <assert.h>
8#include <string.h>
9#include <pthread.h>
10#include <unistd.h>
11#include <stdlib.h>
12#include <rseq/compiler.h>
13#include <errno.h>
14#include <stdint.h>
15#include <stdbool.h>
16#include <stdio.h>
17#include <fcntl.h>
18
19#ifdef HAVE_LIBNUMA
20# include <numa.h>
21# include <numaif.h>
22#endif
23
24#include "rseq-utils.h"
25#include "list.h"
26#include <rseq/rseq.h>
27
28/*
29 * rseq-mempool.c: rseq CPU-Local Storage (CLS) memory allocator.
30 *
31 * The rseq per-CPU memory allocator allows the application to request
32 * memory pools of CPU-Local memory, each containing objects of a
33 * given size (rounded up to the next power of 2), reserving a given
34 * virtual address range per CPU, for a given maximum number of CPUs.
35 *
36 * The per-CPU memory allocator is analogous to TLS (Thread-Local
37 * Storage) memory: whereas TLS provides Thread-Local Storage, the
38 * per-CPU memory allocator provides CPU-Local Storage.
39 */
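/*
 * Illustrative usage sketch (not part of this file). It assumes the
 * public API declared in <rseq/mempool.h> and <rseq/rseq.h>, notably
 * rseq_percpu_ptr(), rseq_current_cpu_raw() and the
 * rseq_mempool_percpu_free() wrapper; struct my_data is hypothetical:
 *
 *	struct my_data { intptr_t count; };
 *	struct rseq_mempool *pool;
 *	struct my_data __rseq_percpu *data;
 *
 *	pool = rseq_mempool_create("example", sizeof(struct my_data), NULL);
 *	if (!pool)
 *		abort();
 *	data = (struct my_data __rseq_percpu *) rseq_mempool_percpu_zmalloc(pool);
 *	if (!data)
 *		abort();
 *	rseq_percpu_ptr(data, rseq_current_cpu_raw())->count++;
 *	rseq_mempool_percpu_free(data);
 *	rseq_mempool_destroy(pool);
 *
 * The plain increment above is only meant to show the per-CPU address
 * computation; real per-CPU updates would typically go through rseq
 * critical sections or per-CPU operations.
 */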
40
41#define POOL_SET_NR_ENTRIES RSEQ_BITS_PER_LONG
42
43#define POOL_HEADER_NR_PAGES 2
44
45/*
46 * Smallest allocation should hold enough space for a free list pointer.
47 */
48#if RSEQ_BITS_PER_LONG == 64
49# define POOL_SET_MIN_ENTRY 3 /* Smallest item_len=8 */
50#else
51# define POOL_SET_MIN_ENTRY 2 /* Smallest item_len=4 */
52#endif
53
54#define BIT_PER_ULONG (8 * sizeof(unsigned long))
55
56#define MOVE_PAGES_BATCH_SIZE 4096
57
58#define RANGE_HEADER_OFFSET sizeof(struct rseq_mempool_range)
59
60#if RSEQ_BITS_PER_LONG == 64
61# define DEFAULT_COW_INIT_POISON_VALUE 0x5555555555555555ULL
62#else
63# define DEFAULT_COW_INIT_POISON_VALUE 0x55555555UL
64#endif
65
66/*
67 * Define the default COW_ZERO poison value as zero to prevent useless
68 * COW page allocation when poison values are written as items are freed.
69 */
70#define DEFAULT_COW_ZERO_POISON_VALUE 0x0
71
72struct free_list_node;
73
74struct free_list_node {
75 struct free_list_node *next;
76};
77
78enum mempool_type {
79 MEMPOOL_TYPE_PERCPU = 0, /* Default */
80 MEMPOOL_TYPE_GLOBAL = 1,
81};
82
83struct rseq_mempool_attr {
84 bool init_set;
85 int (*init_func)(void *priv, void *addr, size_t len, int cpu);
86 void *init_priv;
87
88 bool robust_set;
89
90 enum mempool_type type;
91 size_t stride;
92 int max_nr_cpus;
93
94 unsigned long max_nr_ranges;
95
96 bool poison_set;
97 uintptr_t poison;
98
99 enum rseq_mempool_populate_policy populate_policy;
100};
101
102struct rseq_mempool_range;
103
104struct rseq_mempool_range {
105 struct list_head node; /* Linked list of ranges. */
106 struct rseq_mempool *pool; /* Backward reference to container pool. */
107
108 /*
109 * Memory layout of a mempool range:
110 * - Canary header page (for detection of destroy-after-fork of
111 * COW_INIT pool),
112 * - Header page (contains struct rseq_mempool_range at the
113 * very end),
114 * - Base of the per-cpu data, starting with CPU 0.
115 * Aliases with free-list for non-robust COW_ZERO pool.
116 * - CPU 1,
117 * ...
118 * - CPU max_nr_cpus - 1
119 * - init values (only allocated for COW_INIT pool).
120 * Aliases with free-list for non-robust COW_INIT pool.
121 * - free list (for robust pool).
122 *
123 * The free list aliases the CPU 0 memory area for non-robust
124 * COW_ZERO pools. It aliases with init values for non-robust
125 * COW_INIT pools. It is located immediately after the init
126 * values for robust pools.
127 */
128 void *header;
129 void *base;
130 /*
131 * The init values area holds the malloc_init/zmalloc initial content.
132 * The pointer is NULL for RSEQ_MEMPOOL_POPULATE_COW_ZERO pools.
133 */
134 void *init;
135 size_t next_unused;
136
137 /* Pool range mmap/munmap */
138 void *mmap_addr;
139 size_t mmap_len;
140
141 /* Track alloc/free. */
142 unsigned long *alloc_bitmap;
143};
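/*
 * Worked example of the range layout described above (hypothetical
 * values: 4 KiB pages, stride = 64 KiB, max_nr_cpus = 4, COW_INIT
 * populate policy, robust pool):
 *
 *	mmap_addr + 0 KiB:  canary header page (WIPEONFORK marker byte)
 *	mmap_addr + 4 KiB:  header page (struct rseq_mempool_range at its end)
 *	base + 0 * 64 KiB:  CPU 0 items
 *	base + 1 * 64 KiB:  CPU 1 items
 *	base + 2 * 64 KiB:  CPU 2 items
 *	base + 3 * 64 KiB:  CPU 3 items
 *	base + 4 * 64 KiB:  init values
 *	base + 5 * 64 KiB:  dedicated free list
 *
 * The item at @item_offset for CPU @cpu lives at
 * base + (cpu * stride) + item_offset.
 */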
144
145struct rseq_mempool {
146 struct list_head range_list; /* Head of ranges linked-list. */
147 unsigned long nr_ranges;
148
149 size_t item_len;
150 int item_order;
151
152 /*
153 * COW_INIT non-robust pools:
154 * The free list chains freed items on the init
155 * values address range.
156 *
157 * COW_ZERO non-robust pools:
158 * The free list chains freed items on the CPU 0
159 * address range. We should rethink this
160 * decision if false sharing between malloc/free
161 * from other CPUs and data accesses from CPU 0
162 * becomes an issue.
163 *
164 * Robust pools: The free list chains freed items in the
165 * address range dedicated for the free list.
166 *
167 * This is a NULL-terminated singly-linked list.
168 */
169 struct free_list_node *free_list_head;
170
171 /* This lock protects allocation/free within the pool. */
172 pthread_mutex_t lock;
173
174 struct rseq_mempool_attr attr;
175 char *name;
176};
177
178/*
179 * Pool set entries are indexed by item_len rounded to the next power of
180 * 2. A pool set can contain NULL pool entries, in which case the next
181 * large enough entry will be used for allocation.
182 */
183struct rseq_mempool_set {
184 /* This lock protects add vs malloc/zmalloc within the pool set. */
185 pthread_mutex_t lock;
186 struct rseq_mempool *entries[POOL_SET_NR_ENTRIES];
187};
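/*
 * Indexing example (hypothetical sizes): a pool created with
 * item_len = 24 is rounded up to 32 bytes (order 5) and is stored in
 * entries[5]. A request for len = 100 starts the lookup at order 7
 * (128 bytes) and uses the first non-NULL pool whose item_len is large
 * enough.
 */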
188
189static
190const char *get_pool_name(const struct rseq_mempool *pool)
191{
192 return pool->name ? : "<anonymous>";
193}
194
195static
196void *__rseq_pool_range_percpu_ptr(const struct rseq_mempool_range *range, int cpu,
197 uintptr_t item_offset, size_t stride)
198{
199 return range->base + (stride * cpu) + item_offset;
200}
201
202static
203void *__rseq_pool_range_init_ptr(const struct rseq_mempool_range *range,
204 uintptr_t item_offset)
205{
206 if (!range->init)
207 return NULL;
208 return range->init + item_offset;
209}
210
211static
212void __rseq_percpu *__rseq_free_list_to_percpu_ptr(const struct rseq_mempool *pool,
213 struct free_list_node *node)
214{
215 void __rseq_percpu *p = (void __rseq_percpu *) node;
216
217 if (pool->attr.robust_set) {
218 /* Skip cpus. */
219 p -= pool->attr.max_nr_cpus * pool->attr.stride;
220 /* Skip init values */
221 if (pool->attr.populate_policy == RSEQ_MEMPOOL_POPULATE_COW_INIT)
222 p -= pool->attr.stride;
223
224 } else {
225 /* COW_INIT free list is in init values */
226 if (pool->attr.populate_policy == RSEQ_MEMPOOL_POPULATE_COW_INIT)
227 p -= pool->attr.max_nr_cpus * pool->attr.stride;
228 }
229 return p;
230}
231
232static
233struct free_list_node *__rseq_percpu_to_free_list_ptr(const struct rseq_mempool *pool,
234 void __rseq_percpu *p)
235{
236 if (pool->attr.robust_set) {
237 /* Skip cpus. */
238 p += pool->attr.max_nr_cpus * pool->attr.stride;
239 /* Skip init values */
240 if (pool->attr.populate_policy == RSEQ_MEMPOOL_POPULATE_COW_INIT)
241 p += pool->attr.stride;
242
243 } else {
244 /* COW_INIT free list is in init values */
245 if (pool->attr.populate_policy == RSEQ_MEMPOOL_POPULATE_COW_INIT)
246 p += pool->attr.max_nr_cpus * pool->attr.stride;
247 }
248 return (struct free_list_node *) p;
249}
250
251static
252intptr_t rseq_cmp_item(void *p, size_t item_len, intptr_t cmp_value, intptr_t *unexpected_value)
253{
254 size_t offset;
255 intptr_t res = 0;
256
257 for (offset = 0; offset < item_len; offset += sizeof(uintptr_t)) {
258 intptr_t v = *((intptr_t *) (p + offset));
259
260 if ((res = v - cmp_value) != 0) {
261 if (unexpected_value)
262 *unexpected_value = v;
263 break;
264 }
265 }
266 return res;
267}
268
269static
270void rseq_percpu_zero_item(struct rseq_mempool *pool,
271 struct rseq_mempool_range *range, uintptr_t item_offset)
272{
273 char *init_p = NULL;
274 int i;
275
276 init_p = __rseq_pool_range_init_ptr(range, item_offset);
277 if (init_p)
278 bzero(init_p, pool->item_len);
279 for (i = 0; i < pool->attr.max_nr_cpus; i++) {
280 char *p = __rseq_pool_range_percpu_ptr(range, i,
281 item_offset, pool->attr.stride);
282
283 /*
284 * If item is already zeroed, either because the
285 * init range update has propagated or because the
286 * content is already zeroed (e.g. zero page), don't
287 * write to the page. This eliminates useless COW over
288 * the zero page just for overwriting it with zeroes.
289 *
290 * This means zmalloc() in a COW_ZERO policy pool does
291 * not trigger COW for CPUs which are not actively
292 * writing to the pool. This is however not the case for
293 * malloc_init() in populate-all pools if it populates
294 * non-zero content.
295 */
296 if (!rseq_cmp_item(p, pool->item_len, 0, NULL))
297 continue;
298 bzero(p, pool->item_len);
299 }
300}
301
302static
303void rseq_percpu_init_item(struct rseq_mempool *pool,
304 struct rseq_mempool_range *range, uintptr_t item_offset,
305 void *init_ptr, size_t init_len)
306{
307 char *init_p = NULL;
308 int i;
309
310 init_p = __rseq_pool_range_init_ptr(range, item_offset);
311 if (init_p)
312 memcpy(init_p, init_ptr, init_len);
313 for (i = 0; i < pool->attr.max_nr_cpus; i++) {
314 char *p = __rseq_pool_range_percpu_ptr(range, i,
315 item_offset, pool->attr.stride);
316
317 /*
318 * If the update propagated through a shared mapping,
319 * or the item already has the correct content, skip
320 * writing it into the cpu item to eliminate useless
321 * COW of the page.
322 */
323 if (!memcmp(init_ptr, p, init_len))
324 continue;
325 memcpy(p, init_ptr, init_len);
326 }
327}
328
329static
330void rseq_poison_item(void *p, size_t item_len, uintptr_t poison)
331{
332 size_t offset;
333
334 for (offset = 0; offset < item_len; offset += sizeof(uintptr_t))
335 *((uintptr_t *) (p + offset)) = poison;
336}
337
338static
339void rseq_percpu_poison_item(struct rseq_mempool *pool,
340 struct rseq_mempool_range *range, uintptr_t item_offset)
341{
342 uintptr_t poison = pool->attr.poison;
343 char *init_p = NULL;
344 int i;
345
346 init_p = __rseq_pool_range_init_ptr(range, item_offset);
347 if (init_p)
348 rseq_poison_item(init_p, pool->item_len, poison);
349 for (i = 0; i < pool->attr.max_nr_cpus; i++) {
350 char *p = __rseq_pool_range_percpu_ptr(range, i,
351 item_offset, pool->attr.stride);
352
353 /*
354 * If the update propagated through a shared mapping,
355 * or the item already has the correct content, skip
356 * writing it into the cpu item to eliminate useless
357 * COW of the page.
358 *
359 * It is recommended to use zero as poison value for
360 * COW_ZERO pools to eliminate COW due to writing
361 * poison to CPU memory still backed by the zero page.
362 */
363 if (rseq_cmp_item(p, pool->item_len, poison, NULL) == 0)
364 continue;
365 rseq_poison_item(p, pool->item_len, poison);
366 }
367}
368
369/* Always inline for __builtin_return_address(0). */
370static inline __attribute__((always_inline))
371void rseq_check_poison_item(const struct rseq_mempool *pool, uintptr_t item_offset,
372 void *p, size_t item_len, uintptr_t poison)
373{
374 intptr_t unexpected_value;
375
376 if (rseq_cmp_item(p, item_len, poison, &unexpected_value) == 0)
377 return;
378
379 fprintf(stderr, "%s: Poison corruption detected (0x%lx) for pool: \"%s\" (%p), item offset: %zu, caller: %p.\n",
380 __func__, (unsigned long) unexpected_value, get_pool_name(pool), pool, item_offset, (void *) __builtin_return_address(0));
381 abort();
382}
383
384/* Always inline for __builtin_return_address(0). */
385static inline __attribute__((always_inline))
386void rseq_percpu_check_poison_item(const struct rseq_mempool *pool,
387 const struct rseq_mempool_range *range, uintptr_t item_offset)
388{
389 uintptr_t poison = pool->attr.poison;
390 char *init_p;
391 int i;
392
393 if (!pool->attr.robust_set)
394 return;
395 init_p = __rseq_pool_range_init_ptr(range, item_offset);
396 if (init_p)
397 rseq_check_poison_item(pool, item_offset, init_p, pool->item_len, poison);
398 for (i = 0; i < pool->attr.max_nr_cpus; i++) {
399 char *p = __rseq_pool_range_percpu_ptr(range, i,
400 item_offset, pool->attr.stride);
401 rseq_check_poison_item(pool, item_offset, p, pool->item_len, poison);
402 }
403}
404
405#ifdef HAVE_LIBNUMA
406int rseq_mempool_range_init_numa(void *addr, size_t len, int cpu, int numa_flags)
407{
408 unsigned long nr_pages, page_len;
409 int status[MOVE_PAGES_BATCH_SIZE];
410 int nodes[MOVE_PAGES_BATCH_SIZE];
411 void *pages[MOVE_PAGES_BATCH_SIZE];
412 long ret;
413
414 if (!numa_flags) {
415 errno = EINVAL;
416 return -1;
417 }
418 page_len = rseq_get_page_len();
419 nr_pages = len >> rseq_get_count_order_ulong(page_len);
420
421 nodes[0] = numa_node_of_cpu(cpu);
422 if (nodes[0] < 0)
423 return -1;
424
425 for (size_t k = 1; k < RSEQ_ARRAY_SIZE(nodes); ++k) {
426 nodes[k] = nodes[0];
427 }
428
429 for (unsigned long page = 0; page < nr_pages;) {
430
431 size_t max_k = RSEQ_ARRAY_SIZE(pages);
432 size_t left = nr_pages - page;
433
434 if (left < max_k) {
435 max_k = left;
436 }
437
438 for (size_t k = 0; k < max_k; ++k, ++page) {
439 pages[k] = addr + (page * page_len);
440 status[k] = -EPERM;
441 }
442
443 ret = move_pages(0, max_k, pages, nodes, status, numa_flags);
444
445 if (ret < 0)
446 return ret;
447
448 if (ret > 0) {
449 fprintf(stderr, "%ld pages were not migrated\n", ret);
450 for (size_t k = 0; k < max_k; ++k) {
451 if (status[k] < 0)
452 fprintf(stderr,
453 "Error while moving page %p to numa node %d: %u\n",
454 pages[k], nodes[k], -status[k]);
455 }
456 }
457 }
458 return 0;
459}
460#else
461int rseq_mempool_range_init_numa(void *addr __attribute__((unused)),
462 size_t len __attribute__((unused)),
463 int cpu __attribute__((unused)),
464 int numa_flags __attribute__((unused)))
465{
466 errno = ENOSYS;
467 return -1;
468}
469#endif
470
471static
472int create_alloc_bitmap(struct rseq_mempool *pool, struct rseq_mempool_range *range)
473{
474 size_t count;
475
476 count = ((pool->attr.stride >> pool->item_order) + BIT_PER_ULONG - 1) / BIT_PER_ULONG;
477
478 /*
479 * Not being able to create the validation bitmap is an error
480 * that needs to be reported.
481 */
482 range->alloc_bitmap = calloc(count, sizeof(unsigned long));
483 if (!range->alloc_bitmap)
484 return -1;
485 return 0;
486}
487
488static
489bool percpu_addr_in_pool(const struct rseq_mempool *pool, void __rseq_percpu *_addr)
490{
491 struct rseq_mempool_range *range;
492 void *addr = (void *) _addr;
493
494 list_for_each_entry(range, &pool->range_list, node) {
495 if (addr >= range->base && addr < range->base + range->next_unused)
496 return true;
497 }
498 return false;
499}
500
501/* Always inline for __builtin_return_address(0). */
502static inline __attribute__((always_inline))
503void check_free_list(const struct rseq_mempool *pool, bool mapping_accessible)
504{
505 size_t total_item = 0, total_never_allocated = 0, total_freed = 0,
506 max_list_traversal = 0, traversal_iteration = 0;
507 struct rseq_mempool_range *range;
508
509 if (!pool->attr.robust_set || !mapping_accessible)
510 return;
511
512 list_for_each_entry(range, &pool->range_list, node) {
513 total_item += pool->attr.stride >> pool->item_order;
514 total_never_allocated += (pool->attr.stride - range->next_unused) >> pool->item_order;
515 }
516 max_list_traversal = total_item - total_never_allocated;
517
518 for (struct free_list_node *node = pool->free_list_head, *prev = NULL;
519 node;
520 prev = node,
521 node = node->next) {
522
523 if (traversal_iteration >= max_list_traversal) {
524 fprintf(stderr, "%s: Corrupted free-list; Possibly infinite loop in pool \"%s\" (%p), caller %p.\n",
525 __func__, get_pool_name(pool), pool, __builtin_return_address(0));
526 abort();
527 }
528
529 /* Node is out of range. */
530 if (!percpu_addr_in_pool(pool, __rseq_free_list_to_percpu_ptr(pool, node))) {
531 if (prev)
532 fprintf(stderr, "%s: Corrupted free-list node %p -> [out-of-range %p] in pool \"%s\" (%p), caller %p.\n",
533 __func__, prev, node, get_pool_name(pool), pool, __builtin_return_address(0));
534 else
535 fprintf(stderr, "%s: Corrupted free-list node [out-of-range %p] in pool \"%s\" (%p), caller %p.\n",
536 __func__, node, get_pool_name(pool), pool, __builtin_return_address(0));
537 abort();
538 }
539
540 traversal_iteration++;
541 total_freed++;
542 }
543
544 if (total_never_allocated + total_freed != total_item) {
545 fprintf(stderr, "%s: Corrupted free-list in pool \"%s\" (%p); total-item: %zu total-never-used: %zu total-freed: %zu, caller %p.\n",
546 __func__, get_pool_name(pool), pool, total_item, total_never_allocated, total_freed, __builtin_return_address(0));
547 abort();
548 }
549}
550
551/* Always inline for __builtin_return_address(0). */
552static inline __attribute__((always_inline))
553void check_range_poison(const struct rseq_mempool *pool,
554 const struct rseq_mempool_range *range)
555{
556 size_t item_offset;
557
558 for (item_offset = 0; item_offset < range->next_unused;
559 item_offset += pool->item_len)
560 rseq_percpu_check_poison_item(pool, range, item_offset);
561}
562
563/* Always inline for __builtin_return_address(0). */
564static inline __attribute__((always_inline))
565void check_pool_poison(const struct rseq_mempool *pool, bool mapping_accessible)
566{
567 struct rseq_mempool_range *range;
568
569 if (!pool->attr.robust_set || !mapping_accessible)
570 return;
571 list_for_each_entry(range, &pool->range_list, node)
572 check_range_poison(pool, range);
573}
574
575/* Always inline for __builtin_return_address(0). */
576static inline __attribute__((always_inline))
577void destroy_alloc_bitmap(struct rseq_mempool *pool, struct rseq_mempool_range *range)
578{
579 unsigned long *bitmap = range->alloc_bitmap;
580 size_t count, total_leaks = 0;
581
582 if (!bitmap)
583 return;
584
585 count = ((pool->attr.stride >> pool->item_order) + BIT_PER_ULONG - 1) / BIT_PER_ULONG;
586
587 /* Assert that all items in the pool were freed. */
588 for (size_t k = 0; k < count; ++k)
589 total_leaks += rseq_hweight_ulong(bitmap[k]);
590 if (total_leaks) {
591 fprintf(stderr, "%s: Pool \"%s\" (%p) has %zu leaked items on destroy, caller: %p.\n",
592 __func__, get_pool_name(pool), pool, total_leaks, (void *) __builtin_return_address(0));
593 abort();
594 }
595
596 free(bitmap);
597 range->alloc_bitmap = NULL;
598}
599
600/* Always inline for __builtin_return_address(0). */
601static inline __attribute__((always_inline))
602int rseq_mempool_range_destroy(struct rseq_mempool *pool,
603 struct rseq_mempool_range *range,
604 bool mapping_accessible)
605{
606 destroy_alloc_bitmap(pool, range);
607 if (!mapping_accessible) {
608 /*
609 * Only the header pages are populated in the child
610 * process.
611 */
612 return munmap(range->header, POOL_HEADER_NR_PAGES * rseq_get_page_len());
613 }
614 return munmap(range->mmap_addr, range->mmap_len);
615}
616
617/*
618 * Allocate a memory mapping aligned on @alignment, with an optional
619 * @pre_header before the mapping.
620 */
621static
622void *aligned_mmap_anonymous(size_t page_size, size_t len, size_t alignment,
623 void **pre_header, size_t pre_header_len)
624{
625 size_t minimum_page_count, page_count, extra, total_allocate = 0;
626 int page_order;
627 void *ptr;
628
629 if (len < page_size || alignment < page_size ||
630 !is_pow2(alignment) || (len & (alignment - 1))) {
631 errno = EINVAL;
632 return NULL;
633 }
634 page_order = rseq_get_count_order_ulong(page_size);
635 if (page_order < 0) {
636 errno = EINVAL;
637 return NULL;
638 }
639 if (pre_header_len && (pre_header_len & (page_size - 1))) {
640 errno = EINVAL;
641 return NULL;
642 }
643
644 minimum_page_count = (pre_header_len + len) >> page_order;
645 page_count = (pre_header_len + len + alignment - page_size) >> page_order;
646
647 assert(page_count >= minimum_page_count);
648
649 ptr = mmap(NULL, page_count << page_order, PROT_READ | PROT_WRITE,
650 MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
651 if (ptr == MAP_FAILED) {
652 ptr = NULL;
653 goto alloc_error;
654 }
655
656 total_allocate = page_count << page_order;
657
658 if (!(((uintptr_t) ptr + pre_header_len) & (alignment - 1))) {
659 /* Pointer is already aligned. ptr points to pre_header. */
660 goto out;
661 }
662
663 /* Unmap extra before. */
664 extra = offset_align((uintptr_t) ptr + pre_header_len, alignment);
665 assert(!(extra & (page_size - 1)));
666 if (munmap(ptr, extra)) {
667 perror("munmap");
668 abort();
669 }
670 total_allocate -= extra;
671 ptr += extra; /* ptr points to pre_header */
672 page_count -= extra >> page_order;
673out:
674 assert(page_count >= minimum_page_count);
675
676 if (page_count > minimum_page_count) {
677 void *extra_ptr;
678
679 /* Unmap extra after. */
680 extra_ptr = ptr + (minimum_page_count << page_order);
681 extra = (page_count - minimum_page_count) << page_order;
682 if (munmap(extra_ptr, extra)) {
683 perror("munmap");
684 abort();
685 }
686 total_allocate -= extra;
687 }
688
689 assert(!(((uintptr_t)ptr + pre_header_len) & (alignment - 1)));
690 assert(total_allocate == len + pre_header_len);
691
692alloc_error:
693 if (ptr) {
694 if (pre_header)
695 *pre_header = ptr;
696 ptr += pre_header_len;
697 }
698 return ptr;
699}
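/*
 * Worked example of the trim-before/trim-after approach above
 * (hypothetical values): page_size = 4 KiB, len = 64 KiB,
 * alignment = 64 KiB, pre_header_len = 8 KiB.
 *
 *	minimum_page_count = (8K + 64K) / 4K = 18 pages
 *	page_count = (8K + 64K + 64K - 4K) / 4K = 33 pages (132 KiB mapped)
 *
 * If ptr + 8K does not already fall on a 64 KiB boundary, the leading
 * pages up to that boundary (minus the pre-header) are unmapped, then
 * the pages beyond the 18 required ones are unmapped at the tail,
 * leaving exactly 72 KiB with the address following the pre-header
 * aligned on 64 KiB.
 */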
700
701static
702int rseq_memfd_create_init(const char *poolname, size_t init_len)
703{
704 int fd;
705 char buf[249]; /* Limit is 249 bytes. */
706 const char *name;
707
708 if (poolname) {
709 snprintf(buf, sizeof(buf), "%s:rseq-mempool", poolname);
710 name = buf;
711 } else {
712 name = "<anonymous>:rseq-mempool";
713 }
714
715 fd = memfd_create(name, MFD_CLOEXEC);
716 if (fd < 0) {
717 perror("memfd_create");
718 goto end;
719 }
720 if (ftruncate(fd, (off_t) init_len)) {
721 if (close(fd))
722 perror("close");
723 fd = -1;
724 goto end;
725 }
726end:
727 return fd;
728}
729
730static
731void rseq_memfd_close(int fd)
732{
733 if (fd < 0)
734 return;
735 if (close(fd))
736 perror("close");
737}
738
739static
740struct rseq_mempool_range *rseq_mempool_range_create(struct rseq_mempool *pool)
741{
742 struct rseq_mempool_range *range;
743 unsigned long page_size;
744 void *header;
745 void *base;
746 size_t range_len; /* Range len excludes header. */
747 size_t header_len;
748 int memfd = -1;
749
750 if (pool->attr.max_nr_ranges &&
751 pool->nr_ranges >= pool->attr.max_nr_ranges) {
752 errno = ENOMEM;
753 return NULL;
754 }
755 page_size = rseq_get_page_len();
756
757 header_len = POOL_HEADER_NR_PAGES * page_size;
758 range_len = pool->attr.stride * pool->attr.max_nr_cpus;
759 if (pool->attr.populate_policy == RSEQ_MEMPOOL_POPULATE_COW_INIT)
760 range_len += pool->attr.stride; /* init values */
761 if (pool->attr.robust_set)
762 range_len += pool->attr.stride; /* dedicated free list */
763 base = aligned_mmap_anonymous(page_size, range_len,
764 pool->attr.stride, &header, header_len);
765 if (!base)
766 return NULL;
767 range = (struct rseq_mempool_range *) (base - RANGE_HEADER_OFFSET);
768 range->pool = pool;
769 range->header = header;
770 range->base = base;
771 range->mmap_addr = header;
772 range->mmap_len = header_len + range_len;
773
774 if (pool->attr.populate_policy == RSEQ_MEMPOOL_POPULATE_COW_INIT) {
775 range->init = base + (pool->attr.stride * pool->attr.max_nr_cpus);
776 /* Populate init values pages from memfd */
777 memfd = rseq_memfd_create_init(pool->name, pool->attr.stride);
778 if (memfd < 0)
779 goto error_alloc;
780 if (mmap(range->init, pool->attr.stride, PROT_READ | PROT_WRITE,
781 MAP_SHARED | MAP_FIXED, memfd, 0) != (void *) range->init)
782 goto error_alloc;
783 assert(pool->attr.type == MEMPOOL_TYPE_PERCPU);
784 /*
785 * Map per-cpu memory as private COW mappings of init values.
786 */
787 {
788 int cpu;
789
790 for (cpu = 0; cpu < pool->attr.max_nr_cpus; cpu++) {
791 void *p = base + (pool->attr.stride * cpu);
792 size_t len = pool->attr.stride;
793
794 if (mmap(p, len, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_FIXED,
795 memfd, 0) != (void *) p)
796 goto error_alloc;
797 }
798 }
799 /*
800 * The init values shared mapping should not be shared
801 * with the children processes across fork. Prevent the
802 * whole mapping from being used across fork.
803 */
804 if (madvise(base, range_len, MADV_DONTFORK))
805 goto error_alloc;
806
807 /*
808 * Write 0x1 in first byte of header first page, which
809 * will be WIPEONFORK (and thus cleared) in children
810 * processes. Used to find out if pool destroy is called
811 * from a child process after fork.
812 */
813 *((char *) header) = 0x1;
814 if (madvise(header, page_size, MADV_WIPEONFORK))
815 goto error_alloc;
816
817 /*
818 * The second header page contains the struct
819 * rseq_mempool_range, which is needed by pool destroy.
820 * Leave this anonymous page populated (COW) in child
821 * processes.
822 */
823 rseq_memfd_close(memfd);
824 memfd = -1;
825 }
826
827 if (pool->attr.robust_set) {
828 if (create_alloc_bitmap(pool, range))
829 goto error_alloc;
830 }
831 if (pool->attr.init_set) {
832 switch (pool->attr.type) {
833 case MEMPOOL_TYPE_GLOBAL:
834 if (pool->attr.init_func(pool->attr.init_priv,
835 base, pool->attr.stride, -1)) {
836 goto error_alloc;
837 }
838 break;
839 case MEMPOOL_TYPE_PERCPU:
840 {
841 int cpu;
842 for (cpu = 0; cpu < pool->attr.max_nr_cpus; cpu++) {
843 if (pool->attr.init_func(pool->attr.init_priv,
844 base + (pool->attr.stride * cpu),
845 pool->attr.stride, cpu)) {
846 goto error_alloc;
847 }
848 }
849 break;
850 }
851 default:
852 abort();
853 }
854 }
855 pool->nr_ranges++;
856 return range;
857
858error_alloc:
859 rseq_memfd_close(memfd);
860 (void) rseq_mempool_range_destroy(pool, range, true);
861 return NULL;
862}
863
864static
865bool pool_mappings_accessible(struct rseq_mempool *pool)
866{
867 struct rseq_mempool_range *range;
868 size_t page_size;
869 char *addr;
870
871 if (pool->attr.populate_policy != RSEQ_MEMPOOL_POPULATE_COW_INIT)
872 return true;
873 if (list_empty(&pool->range_list))
874 return true;
875 range = list_first_entry(&pool->range_list, struct rseq_mempool_range, node);
876 page_size = rseq_get_page_len();
877 /*
878 * Header first page is one page before the page containing the
879 * range structure.
880 */
881 addr = (char *) ((uintptr_t) range & ~(page_size - 1)) - page_size;
882 /*
883 * Look for 0x1 first byte marker in header first page.
884 */
885 if (*addr != 0x1)
886 return false;
887 return true;
888}
889
890int rseq_mempool_destroy(struct rseq_mempool *pool)
891{
892 struct rseq_mempool_range *range, *tmp_range;
893 bool mapping_accessible;
894 int ret = 0;
895
896 if (!pool)
897 return 0;
898
899 /*
900 * Validate that the pool mappings are accessible before doing
901 * free list/poison validation and unmapping ranges. This allows
902 * calling pool destroy in child process after a fork for COW_INIT
903 * pools to free pool resources.
904 */
905 mapping_accessible = pool_mappings_accessible(pool);
906
907 check_free_list(pool, mapping_accessible);
908 check_pool_poison(pool, mapping_accessible);
909
910 /* Iteration safe against removal. */
911 list_for_each_entry_safe(range, tmp_range, &pool->range_list, node) {
912 list_del(&range->node);
913 if ((ret = rseq_mempool_range_destroy(pool, range, mapping_accessible))) {
914 /* Keep list coherent in case of partial failure. */
915 list_add(&range->node, &pool->range_list);
916 goto end;
917 }
918 }
919 pthread_mutex_destroy(&pool->lock);
920 free(pool->name);
921 free(pool);
922end:
923 return ret;
924}
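/*
 * Fork interaction sketch (assuming @pool is a COW_INIT pool created by
 * the parent process): the per-CPU, init and free-list ranges are
 * MADV_DONTFORK, so only the header pages remain mapped in the child,
 * and the WIPEONFORK marker byte is cleared there. Destroying the pool
 * from the child therefore skips free-list/poison validation and only
 * unmaps the header pages:
 *
 *	pid_t pid = fork();
 *
 *	if (pid == 0) {
 *		rseq_mempool_destroy(pool);
 *		_exit(0);
 *	}
 */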
925
926struct rseq_mempool *rseq_mempool_create(const char *pool_name,
927 size_t item_len, const struct rseq_mempool_attr *_attr)
928{
929 struct rseq_mempool_attr attr = {};
930 struct rseq_mempool_range *range;
931 struct rseq_mempool *pool;
932 int order;
933
934 /* Make sure each item is large enough to contain free list pointers. */
935 if (item_len < sizeof(void *))
936 item_len = sizeof(void *);
937
938 /* Align item_len on next power of two. */
939 order = rseq_get_count_order_ulong(item_len);
940 if (order < 0) {
941 errno = EINVAL;
942 return NULL;
943 }
944 item_len = 1UL << order;
945
946 if (_attr)
947 memcpy(&attr, _attr, sizeof(attr));
948
949 /*
950 * Validate that the pool populate policy requested is known.
951 */
952 switch (attr.populate_policy) {
953 case RSEQ_MEMPOOL_POPULATE_COW_INIT:
954 break;
955 case RSEQ_MEMPOOL_POPULATE_COW_ZERO:
956 break;
957 default:
958 errno = EINVAL;
959 return NULL;
960 }
961
962 switch (attr.type) {
963 case MEMPOOL_TYPE_PERCPU:
964 if (attr.max_nr_cpus < 0) {
965 errno = EINVAL;
966 return NULL;
967 }
968 if (attr.max_nr_cpus == 0) {
969 /* Auto-detect */
970 attr.max_nr_cpus = rseq_get_max_nr_cpus();
971 if (attr.max_nr_cpus == 0) {
972 errno = EINVAL;
973 return NULL;
974 }
975 }
976 break;
977 case MEMPOOL_TYPE_GLOBAL:
978 /* Override populate policy for global type. */
979 if (attr.populate_policy == RSEQ_MEMPOOL_POPULATE_COW_INIT)
980 attr.populate_policy = RSEQ_MEMPOOL_POPULATE_COW_ZERO;
981 /* Use a 1-cpu pool for global mempool type. */
982 attr.max_nr_cpus = 1;
983 break;
984 }
985 if (!attr.stride)
986 attr.stride = RSEQ_MEMPOOL_STRIDE; /* Use default */
987 if (attr.robust_set && !attr.poison_set) {
988 attr.poison_set = true;
989 if (attr.populate_policy == RSEQ_MEMPOOL_POPULATE_COW_INIT)
990 attr.poison = DEFAULT_COW_INIT_POISON_VALUE;
991 else
992 attr.poison = DEFAULT_COW_ZERO_POISON_VALUE;
993 }
994 if (item_len > attr.stride || attr.stride < (size_t) rseq_get_page_len() ||
995 !is_pow2(attr.stride)) {
996 errno = EINVAL;
997 return NULL;
998 }
999
1000 pool = calloc(1, sizeof(struct rseq_mempool));
1001 if (!pool)
1002 return NULL;
1003
1004 memcpy(&pool->attr, &attr, sizeof(attr));
1005 pthread_mutex_init(&pool->lock, NULL);
1006 pool->item_len = item_len;
1007 pool->item_order = order;
1008 INIT_LIST_HEAD(&pool->range_list);
1009
1010 range = rseq_mempool_range_create(pool);
1011 if (!range)
1012 goto error_alloc;
1013 list_add(&range->node, &pool->range_list);
1014
1015 if (pool_name) {
1016 pool->name = strdup(pool_name);
1017 if (!pool->name)
1018 goto error_alloc;
1019 }
1020 return pool;
1021
1022error_alloc:
1023 rseq_mempool_destroy(pool);
1024 errno = ENOMEM;
1025 return NULL;
1026}
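/*
 * Example of creating a robust per-CPU pool with explicit attributes
 * (sketch using the attribute accessors defined below; struct my_item
 * is hypothetical, and max_nr_cpus = 0 requests auto-detection):
 *
 *	struct rseq_mempool_attr *attr;
 *	struct rseq_mempool *pool;
 *
 *	attr = rseq_mempool_attr_create();
 *	if (!attr)
 *		abort();
 *	if (rseq_mempool_attr_set_robust(attr))
 *		abort();
 *	if (rseq_mempool_attr_set_percpu(attr, RSEQ_MEMPOOL_STRIDE, 0))
 *		abort();
 *	pool = rseq_mempool_create("robust-pool", sizeof(struct my_item), attr);
 *	rseq_mempool_attr_destroy(attr);
 *	if (!pool)
 *		abort();
 */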
1027
1028/* Always inline for __builtin_return_address(0). */
1029static inline __attribute__((always_inline))
1030void set_alloc_slot(struct rseq_mempool *pool, struct rseq_mempool_range *range, size_t item_offset)
1031{
1032 unsigned long *bitmap = range->alloc_bitmap;
1033 size_t item_index = item_offset >> pool->item_order;
1034 unsigned long mask;
1035 size_t k;
1036
1037 if (!bitmap)
1038 return;
1039
1040 k = item_index / BIT_PER_ULONG;
1041 mask = 1ULL << (item_index % BIT_PER_ULONG);
1042
1043 /* Print error if bit is already set. */
1044 if (bitmap[k] & mask) {
1045 fprintf(stderr, "%s: Allocator corruption detected for pool: \"%s\" (%p), item offset: %zu, caller: %p.\n",
1046 __func__, get_pool_name(pool), pool, item_offset, (void *) __builtin_return_address(0));
1047 abort();
1048 }
1049 bitmap[k] |= mask;
1050}
1051
1052static
1053void __rseq_percpu *__rseq_percpu_malloc(struct rseq_mempool *pool,
1054 bool zeroed, void *init_ptr, size_t init_len)
1055{
1056 struct rseq_mempool_range *range;
1057 struct free_list_node *node;
1058 uintptr_t item_offset;
1059 void __rseq_percpu *addr;
1060
1061 if (init_len > pool->item_len) {
1062 errno = EINVAL;
1063 return NULL;
1064 }
1065 pthread_mutex_lock(&pool->lock);
1066 /* Get first entry from free list. */
1067 node = pool->free_list_head;
1068 if (node != NULL) {
1069 void *range_base, *ptr;
1070
1071 ptr = __rseq_free_list_to_percpu_ptr(pool, node);
1072 range_base = (void *) ((uintptr_t) ptr & (~(pool->attr.stride - 1)));
1073 range = (struct rseq_mempool_range *) (range_base - RANGE_HEADER_OFFSET);
1074 /* Remove node from free list (update head). */
1075 pool->free_list_head = node->next;
1076 item_offset = (uintptr_t) (ptr - range_base);
1077 rseq_percpu_check_poison_item(pool, range, item_offset);
1078 addr = __rseq_free_list_to_percpu_ptr(pool, node);
1079 goto end;
1080 }
1081 /*
1082 * If there are no ranges, or if the most recent range (first in
1083 * list) does not have any room left, create a new range and
1084 * prepend it to the list head.
1085 */
1086 if (list_empty(&pool->range_list))
1087 goto create_range;
1088 range = list_first_entry(&pool->range_list, struct rseq_mempool_range, node);
1089 if (range->next_unused + pool->item_len > pool->attr.stride)
1090 goto create_range;
1091 else
1092 goto room_left;
1093create_range:
1094 range = rseq_mempool_range_create(pool);
1095 if (!range) {
1096 errno = ENOMEM;
1097 addr = NULL;
1098 goto end;
1099 }
1100 /* Add range to head of list. */
1101 list_add(&range->node, &pool->range_list);
1102room_left:
1103 /* First range in list has room left. */
1104 item_offset = range->next_unused;
1105 addr = (void __rseq_percpu *) (range->base + item_offset);
1106 range->next_unused += pool->item_len;
1107end:
1108 if (addr)
1109 set_alloc_slot(pool, range, item_offset);
1110 pthread_mutex_unlock(&pool->lock);
1111 if (addr) {
1112 if (zeroed)
1113 rseq_percpu_zero_item(pool, range, item_offset);
1114 else if (init_ptr) {
1115 rseq_percpu_init_item(pool, range, item_offset,
1116 init_ptr, init_len);
1117 }
1118 }
1119 return addr;
1120}
1121
1122void __rseq_percpu *rseq_mempool_percpu_malloc(struct rseq_mempool *pool)
1123{
1124 return __rseq_percpu_malloc(pool, false, NULL, 0);
1125}
1126
1127void __rseq_percpu *rseq_mempool_percpu_zmalloc(struct rseq_mempool *pool)
1128{
1129 return __rseq_percpu_malloc(pool, true, NULL, 0);
1130}
1131
1132void __rseq_percpu *rseq_mempool_percpu_malloc_init(struct rseq_mempool *pool,
1133 void *init_ptr, size_t len)
1134{
1135 return __rseq_percpu_malloc(pool, false, init_ptr, len);
1136}
1137
1138/* Always inline for __builtin_return_address(0). */
1139static inline __attribute__((always_inline))
1140void clear_alloc_slot(struct rseq_mempool *pool, struct rseq_mempool_range *range, size_t item_offset)
1141{
1142 unsigned long *bitmap = range->alloc_bitmap;
1143 size_t item_index = item_offset >> pool->item_order;
1144 unsigned long mask;
1145 size_t k;
1146
1147 if (!bitmap)
1148 return;
1149
1150 k = item_index / BIT_PER_ULONG;
1151 mask = 1ULL << (item_index % BIT_PER_ULONG);
1152
1153 /* Print error if bit is not set. */
1154 if (!(bitmap[k] & mask)) {
1155 fprintf(stderr, "%s: Double-free detected for pool: \"%s\" (%p), item offset: %zu, caller: %p.\n",
1156 __func__, get_pool_name(pool), pool, item_offset,
1157 (void *) __builtin_return_address(0));
1158 abort();
1159 }
1160 bitmap[k] &= ~mask;
1161}
1162
1163void librseq_mempool_percpu_free(void __rseq_percpu *_ptr, size_t stride)
1164{
1165 uintptr_t ptr = (uintptr_t) _ptr;
1166 void *range_base = (void *) (ptr & (~(stride - 1)));
1167 struct rseq_mempool_range *range = (struct rseq_mempool_range *) (range_base - RANGE_HEADER_OFFSET);
1168 struct rseq_mempool *pool = range->pool;
1169 uintptr_t item_offset = ptr & (stride - 1);
1170 struct free_list_node *head, *item;
1171
1172 pthread_mutex_lock(&pool->lock);
1173 clear_alloc_slot(pool, range, item_offset);
1174 /* Add ptr to head of free list */
1175 head = pool->free_list_head;
1176 if (pool->attr.poison_set)
1177 rseq_percpu_poison_item(pool, range, item_offset);
1178 item = __rseq_percpu_to_free_list_ptr(pool, _ptr);
1179 /*
1180 * Setting the next pointer will overwrite the first uintptr_t
1181 * poison for either CPU 0 (COW_ZERO, non-robust), or init data
1182 * (COW_INIT, non-robust).
1183 */
1184 item->next = head;
1185 pool->free_list_head = item;
1186 pthread_mutex_unlock(&pool->lock);
1187}
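/*
 * Address arithmetic example for the free path above (hypothetical
 * numbers): with stride = 0x10000 (64 KiB), a per-CPU pointer
 * _ptr = 0x7f0000012340 decomposes into:
 *
 *	range_base  = 0x7f0000012340 & ~0xffff = 0x7f0000010000
 *	item_offset = 0x7f0000012340 &  0xffff = 0x2340
 *
 * The struct rseq_mempool_range is then found at
 * range_base - sizeof(struct rseq_mempool_range).
 */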
1188
1189struct rseq_mempool_set *rseq_mempool_set_create(void)
1190{
1191 struct rseq_mempool_set *pool_set;
1192
1193 pool_set = calloc(1, sizeof(struct rseq_mempool_set));
1194 if (!pool_set)
1195 return NULL;
1196 pthread_mutex_init(&pool_set->lock, NULL);
1197 return pool_set;
1198}
1199
1200int rseq_mempool_set_destroy(struct rseq_mempool_set *pool_set)
1201{
1202 int order, ret;
1203
1204 for (order = POOL_SET_MIN_ENTRY; order < POOL_SET_NR_ENTRIES; order++) {
1205 struct rseq_mempool *pool = pool_set->entries[order];
1206
1207 if (!pool)
1208 continue;
1209 ret = rseq_mempool_destroy(pool);
1210 if (ret)
1211 return ret;
1212 pool_set->entries[order] = NULL;
1213 }
1214 pthread_mutex_destroy(&pool_set->lock);
1215 free(pool_set);
1216 return 0;
1217}
1218
1219/* Ownership of pool is handed over to pool set on success. */
1220int rseq_mempool_set_add_pool(struct rseq_mempool_set *pool_set, struct rseq_mempool *pool)
1221{
1222 size_t item_order = pool->item_order;
1223 int ret = 0;
1224
1225 pthread_mutex_lock(&pool_set->lock);
1226 if (pool_set->entries[item_order]) {
1227 errno = EBUSY;
1228 ret = -1;
1229 goto end;
1230 }
1231 pool_set->entries[pool->item_order] = pool;
1232end:
1233 pthread_mutex_unlock(&pool_set->lock);
1234 return ret;
1235}
1236
1237static
1238void __rseq_percpu *__rseq_mempool_set_malloc(struct rseq_mempool_set *pool_set,
1239 void *init_ptr, size_t len, bool zeroed)
1240{
1241 int order, min_order = POOL_SET_MIN_ENTRY;
1242 struct rseq_mempool *pool;
1243 void __rseq_percpu *addr;
1244
1245 order = rseq_get_count_order_ulong(len);
1246 if (order > POOL_SET_MIN_ENTRY)
1247 min_order = order;
1248again:
1249 pthread_mutex_lock(&pool_set->lock);
1250 /* First smallest present pool where @len fits. */
1251 for (order = min_order; order < POOL_SET_NR_ENTRIES; order++) {
1252 pool = pool_set->entries[order];
1253
1254 if (!pool)
1255 continue;
1256 if (pool->item_len >= len)
1257 goto found;
1258 }
1259 pool = NULL;
1260found:
1261 pthread_mutex_unlock(&pool_set->lock);
1262 if (pool) {
1263 addr = __rseq_percpu_malloc(pool, zeroed, init_ptr, len);
1264 if (addr == NULL && errno == ENOMEM) {
1265 /*
1266 * If the allocation failed, try again with a
1267 * larger pool.
1268 */
1269 min_order = order + 1;
1270 goto again;
1271 }
1272 } else {
1273 /* Not found. */
1274 errno = ENOMEM;
1275 addr = NULL;
1276 }
1277 return addr;
1278}
1279
1280void __rseq_percpu *rseq_mempool_set_percpu_malloc(struct rseq_mempool_set *pool_set, size_t len)
1281{
1282 return __rseq_mempool_set_malloc(pool_set, NULL, len, false);
1283}
1284
1285void __rseq_percpu *rseq_mempool_set_percpu_zmalloc(struct rseq_mempool_set *pool_set, size_t len)
1286{
1287 return __rseq_mempool_set_malloc(pool_set, NULL, len, true);
1288}
1289
1290void __rseq_percpu *rseq_mempool_set_percpu_malloc_init(struct rseq_mempool_set *pool_set,
1291 void *init_ptr, size_t len)
1292{
1293 return __rseq_mempool_set_malloc(pool_set, init_ptr, len, false);
1294}
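/*
 * Pool set usage sketch (illustrative only; rseq_mempool_percpu_free()
 * is assumed from <rseq/mempool.h>). Ownership of each pool is handed
 * over to the set on success, and the 100-byte request below is served
 * from the 128-byte pool, the smallest entry where it fits:
 *
 *	struct rseq_mempool_set *set;
 *	struct rseq_mempool *small, *large;
 *	void __rseq_percpu *p;
 *
 *	set = rseq_mempool_set_create();
 *	small = rseq_mempool_create("s32", 32, NULL);
 *	large = rseq_mempool_create("s128", 128, NULL);
 *	if (!set || !small || !large)
 *		abort();
 *	if (rseq_mempool_set_add_pool(set, small) ||
 *			rseq_mempool_set_add_pool(set, large))
 *		abort();
 *	p = rseq_mempool_set_percpu_zmalloc(set, 100);
 *	if (!p)
 *		abort();
 *	rseq_mempool_percpu_free(p);
 *	rseq_mempool_set_destroy(set);
 */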
1295
1296struct rseq_mempool_attr *rseq_mempool_attr_create(void)
1297{
1298 return calloc(1, sizeof(struct rseq_mempool_attr));
1299}
1300
1301void rseq_mempool_attr_destroy(struct rseq_mempool_attr *attr)
1302{
1303 free(attr);
1304}
1305
1306int rseq_mempool_attr_set_init(struct rseq_mempool_attr *attr,
1307 int (*init_func)(void *priv, void *addr, size_t len, int cpu),
1308 void *init_priv)
1309{
1310 if (!attr) {
1311 errno = EINVAL;
1312 return -1;
1313 }
1314 attr->init_set = true;
1315 attr->init_func = init_func;
1316 attr->init_priv = init_priv;
1317 attr->populate_policy = RSEQ_MEMPOOL_POPULATE_COW_INIT;
1318 return 0;
1319}
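/*
 * Example wiring of the init callback (sketch): a common use is to move
 * each CPU's range onto that CPU's NUMA node by calling
 * rseq_mempool_range_init_numa() from the callback. MPOL_MF_MOVE comes
 * from <numaif.h> and requires libnuma support; note that setting an
 * init callback forces the COW_INIT populate policy:
 *
 *	static int init_numa(void *priv __attribute__((unused)),
 *			void *addr, size_t len, int cpu)
 *	{
 *		return rseq_mempool_range_init_numa(addr, len, cpu, MPOL_MF_MOVE);
 *	}
 *
 *	...
 *	if (rseq_mempool_attr_set_init(attr, init_numa, NULL))
 *		abort();
 */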
1320
1321int rseq_mempool_attr_set_robust(struct rseq_mempool_attr *attr)
1322{
1323 if (!attr) {
1324 errno = EINVAL;
1325 return -1;
1326 }
1327 attr->robust_set = true;
1328 return 0;
1329}
1330
1331int rseq_mempool_attr_set_percpu(struct rseq_mempool_attr *attr,
1332 size_t stride, int max_nr_cpus)
1333{
1334 if (!attr) {
1335 errno = EINVAL;
1336 return -1;
1337 }
1338 attr->type = MEMPOOL_TYPE_PERCPU;
1339 attr->stride = stride;
1340 attr->max_nr_cpus = max_nr_cpus;
1341 return 0;
1342}
1343
1344int rseq_mempool_attr_set_global(struct rseq_mempool_attr *attr,
1345 size_t stride)
1346{
1347 if (!attr) {
1348 errno = EINVAL;
1349 return -1;
1350 }
1351 attr->type = MEMPOOL_TYPE_GLOBAL;
1352 attr->stride = stride;
1353 attr->max_nr_cpus = 0;
1354 return 0;
1355}
1356
1357int rseq_mempool_attr_set_max_nr_ranges(struct rseq_mempool_attr *attr,
1358 unsigned long max_nr_ranges)
1359{
1360 if (!attr) {
1361 errno = EINVAL;
1362 return -1;
1363 }
1364 attr->max_nr_ranges = max_nr_ranges;
1365 return 0;
1366}
1367
1368int rseq_mempool_attr_set_poison(struct rseq_mempool_attr *attr,
1369 uintptr_t poison)
1370{
1371 if (!attr) {
1372 errno = EINVAL;
1373 return -1;
1374 }
1375 attr->poison_set = true;
1376 attr->poison = poison;
1377 return 0;
1378}
1379
1380int rseq_mempool_attr_set_populate_policy(struct rseq_mempool_attr *attr,
1381 enum rseq_mempool_populate_policy policy)
1382{
1383 if (!attr) {
1384 errno = EINVAL;
1385 return -1;
1386 }
1387 attr->populate_policy = policy;
1388 return 0;
1389}
1390
1391int rseq_mempool_get_max_nr_cpus(struct rseq_mempool *mempool)
1392{
1393 if (!mempool || mempool->attr.type != MEMPOOL_TYPE_PERCPU) {
1394 errno = EINVAL;
1395 return -1;
1396 }
1397 return mempool->attr.max_nr_cpus;
1398}