Move robust pool free list to own memory area
librseq.git: src/rseq-mempool.c
1 // SPDX-License-Identifier: MIT
2 // SPDX-FileCopyrightText: 2024 Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
3
4 #include <rseq/mempool.h>
5 #include <sys/mman.h>
6 #include <assert.h>
7 #include <string.h>
8 #include <pthread.h>
9 #include <unistd.h>
10 #include <stdlib.h>
11 #include <rseq/compiler.h>
12 #include <errno.h>
13 #include <stdint.h>
14 #include <stdbool.h>
15 #include <stdio.h>
16 #include <fcntl.h>
17
18 #ifdef HAVE_LIBNUMA
19 # include <numa.h>
20 # include <numaif.h>
21 #endif
22
23 #include "rseq-utils.h"
24 #include <rseq/rseq.h>
25
26 /*
27 * rseq-mempool.c: rseq CPU-Local Storage (CLS) memory allocator.
28 *
29  * The rseq per-CPU memory allocator allows the application to request
30  * memory pools of CPU-Local memory, each containing objects of a
31  * given size (rounded up to the next power of 2), reserving a given
32  * virtual address size per CPU, for a given maximum number of CPUs.
33  *
34  * The per-CPU memory allocator is analogous to TLS (Thread-Local
35  * Storage) memory: TLS provides Thread-Local Storage, whereas the
36  * per-CPU memory allocator provides CPU-Local Storage.
37 */
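For orientation, a minimal usage sketch of the public API implemented in this file (not taken from the library documentation; error handling omitted). It assumes rseq_percpu_ptr(), rseq_current_cpu_raw() and the stride-defaulting rseq_mempool_percpu_free() wrapper provided by the public rseq headers.

static void example_usage(void)
{
	struct rseq_mempool_attr *attr = rseq_mempool_attr_create();
	struct rseq_mempool *pool;
	struct counter { long count; };
	struct counter __rseq_percpu *c;

	/* Per-CPU pool: default stride, auto-detected max_nr_cpus, robust checks enabled. */
	rseq_mempool_attr_set_percpu(attr, RSEQ_MEMPOOL_STRIDE, 0);
	rseq_mempool_attr_set_robust(attr);
	pool = rseq_mempool_create("example-counters", sizeof(struct counter), attr);
	rseq_mempool_attr_destroy(attr);

	/* Allocate one item, zero-initialized across all CPUs. */
	c = (struct counter __rseq_percpu *) rseq_mempool_percpu_zmalloc(pool);

	/* Access the copy belonging to the current CPU. */
	rseq_percpu_ptr(c, rseq_current_cpu_raw())->count++;

	rseq_mempool_percpu_free(c);
	rseq_mempool_destroy(pool);
}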
38
39 #define POOL_SET_NR_ENTRIES RSEQ_BITS_PER_LONG
40
41 /*
42 * Smallest allocation should hold enough space for a free list pointer.
43 */
44 #if RSEQ_BITS_PER_LONG == 64
45 # define POOL_SET_MIN_ENTRY 3 /* Smallest item_len=8 */
46 #else
47 # define POOL_SET_MIN_ENTRY 2 /* Smallest item_len=4 */
48 #endif
49
50 #define BIT_PER_ULONG (8 * sizeof(unsigned long))
51
52 #define MOVE_PAGES_BATCH_SIZE 4096
53
54 #define RANGE_HEADER_OFFSET sizeof(struct rseq_mempool_range)
55
56 #if RSEQ_BITS_PER_LONG == 64
57 # define DEFAULT_POISON_VALUE 0x5555555555555555ULL
58 #else
59 # define DEFAULT_POISON_VALUE 0x55555555UL
60 #endif
61
62 struct free_list_node;
63
64 struct free_list_node {
65 struct free_list_node *next;
66 };
67
68 enum mempool_type {
69 MEMPOOL_TYPE_GLOBAL = 0, /* Default */
70 MEMPOOL_TYPE_PERCPU = 1,
71 };
72
73 struct rseq_mempool_attr {
74 bool mmap_set;
75 void *(*mmap_func)(void *priv, size_t len);
76 int (*munmap_func)(void *priv, void *ptr, size_t len);
77 void *mmap_priv;
78
79 bool init_set;
80 int (*init_func)(void *priv, void *addr, size_t len, int cpu);
81 void *init_priv;
82
83 bool robust_set;
84
85 enum mempool_type type;
86 size_t stride;
87 int max_nr_cpus;
88
89 unsigned long max_nr_ranges;
90
91 bool poison_set;
92 uintptr_t poison;
93
94 enum rseq_mempool_populate_policy populate_policy;
95 };
96
97 struct rseq_mempool_range;
98
99 struct rseq_mempool_range {
100 struct rseq_mempool_range *next; /* Linked list of ranges. */
101 struct rseq_mempool *pool; /* Backward reference to container pool. */
102
103 /*
104 * Memory layout of a mempool range:
105 * - Header page (contains struct rseq_mempool_range at the very end),
106 * - Base of the per-cpu data, starting with CPU 0.
107 * Aliases with free-list for non-robust populate all pool.
108 * - CPU 1,
109 * ...
110 * - CPU max_nr_cpus - 1
111 * - init values (unpopulated for RSEQ_MEMPOOL_POPULATE_ALL).
112 * Aliases with free-list for non-robust populate none pool.
113 * - free list (for robust pool).
114 *
115 * The free list aliases the CPU 0 memory area for non-robust
116 * populate all pools. It aliases with init values for
117 * non-robust populate none pools. It is located immediately
118 * after the init values for robust pools.
119 */
120 void *header;
121 void *base;
122 /*
123  * The init values area contains the malloc_init/zmalloc initial values.
124 * Pointer is NULL for RSEQ_MEMPOOL_POPULATE_ALL.
125 */
126 void *init;
127 size_t next_unused;
128
129 /* Pool range mmap/munmap */
130 void *mmap_addr;
131 size_t mmap_len;
132
133 /* Track alloc/free. */
134 unsigned long *alloc_bitmap;
135 };
136
137 struct rseq_mempool {
138 /* Head of ranges linked-list. */
139 struct rseq_mempool_range *range_list;
140 unsigned long nr_ranges;
141
142 size_t item_len;
143 int item_order;
144
145 /*
146  * The free list is a NULL-terminated singly-linked list chaining freed
147  * items. It aliases the CPU 0 address range for non-robust populate-all
148  * pools, the init values area for non-robust populate-none pools, and a
149  * dedicated area for robust pools. We should rethink the CPU 0 aliasing
150  * if false sharing between malloc/free and CPU 0 data accesses becomes an issue.
151 */
152 struct free_list_node *free_list_head;
153
154 /* This lock protects allocation/free within the pool. */
155 pthread_mutex_t lock;
156
157 struct rseq_mempool_attr attr;
158 char *name;
159 };
160
161 /*
162 * Pool set entries are indexed by item_len rounded to the next power of
163 * 2. A pool set can contain NULL pool entries, in which case the next
164 * large enough entry will be used for allocation.
165 */
166 struct rseq_mempool_set {
167 /* This lock protects add vs malloc/zmalloc within the pool set. */
168 pthread_mutex_t lock;
169 struct rseq_mempool *entries[POOL_SET_NR_ENTRIES];
170 };
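As a sketch of the intended usage (illustrative names and sizes, error handling omitted), a set can be filled with one per-CPU pool per size class and then queried by length:

static void __rseq_percpu *example_set_alloc(struct rseq_mempool_set **setp)
{
	struct rseq_mempool_attr *attr = rseq_mempool_attr_create();
	struct rseq_mempool_set *set = rseq_mempool_set_create();
	size_t len;

	rseq_mempool_attr_set_percpu(attr, RSEQ_MEMPOOL_STRIDE, 0);
	/* One pool per item size class; the set takes ownership of each pool. */
	for (len = 16; len <= 128; len <<= 1)
		rseq_mempool_set_add_pool(set,
			rseq_mempool_create("example-set", len, attr));
	rseq_mempool_attr_destroy(attr);
	*setp = set;
	/* A 40-byte request is served by the 64-byte pool (smallest item_len >= 40). */
	return rseq_mempool_set_percpu_zmalloc(set, 40);
}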
171
172 /*
173  * This memfd is used to implement the user-space COW behavior for the
174  * page protection scheme. The memfd is a sparse virtual file. Its
175  * layout (offsets from the beginning of the file) matches the process
176  * address space (pointers are directly converted to file offsets).
177 */
178 struct rseq_memfd {
179 pthread_mutex_t lock;
180 size_t reserved_size;
181 unsigned int refcount;
182 int fd;
183 };
184
185 static struct rseq_memfd memfd = {
186 .lock = PTHREAD_MUTEX_INITIALIZER,
187 .reserved_size = 0,
188 .refcount = 0,
189 .fd = -1,
190 };
191
192 static
193 const char *get_pool_name(const struct rseq_mempool *pool)
194 {
195 return pool->name ? : "<anonymous>";
196 }
197
198 static
199 void *__rseq_pool_range_percpu_ptr(const struct rseq_mempool_range *range, int cpu,
200 uintptr_t item_offset, size_t stride)
201 {
202 return range->base + (stride * cpu) + item_offset;
203 }
204
205 static
206 void *__rseq_pool_range_init_ptr(const struct rseq_mempool_range *range,
207 uintptr_t item_offset)
208 {
209 if (!range->init)
210 return NULL;
211 return range->init + item_offset;
212 }
213
214 static
215 void __rseq_percpu *__rseq_free_list_to_percpu_ptr(const struct rseq_mempool *pool,
216 struct free_list_node *node)
217 {
218 void __rseq_percpu *p = (void __rseq_percpu *) node;
219
220 if (pool->attr.robust_set) {
221 /* Skip cpus. */
222 p -= pool->attr.max_nr_cpus * pool->attr.stride;
223 /* Skip init values */
224 if (pool->attr.populate_policy != RSEQ_MEMPOOL_POPULATE_ALL)
225 p -= pool->attr.stride;
226
227 } else {
228 /* Populate none free list is in init values */
229 if (pool->attr.populate_policy != RSEQ_MEMPOOL_POPULATE_ALL)
230 p -= pool->attr.max_nr_cpus * pool->attr.stride;
231 }
232 return p;
233 }
234
235 static
236 struct free_list_node *__rseq_percpu_to_free_list_ptr(const struct rseq_mempool *pool,
237 void __rseq_percpu *p)
238 {
239 if (pool->attr.robust_set) {
240 /* Skip cpus. */
241 p += pool->attr.max_nr_cpus * pool->attr.stride;
242 /* Skip init values */
243 if (pool->attr.populate_policy != RSEQ_MEMPOOL_POPULATE_ALL)
244 p += pool->attr.stride;
245
246 } else {
247 /* Populate none free list is in init values */
248 if (pool->attr.populate_policy != RSEQ_MEMPOOL_POPULATE_ALL)
249 p += pool->attr.max_nr_cpus * pool->attr.stride;
250 }
251 return (struct free_list_node *) p;
252 }
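A worked example of the conversions above, with illustrative numbers:

/*
 * Example (illustrative numbers): robust populate-none pool with
 * max_nr_cpus = 4 and stride = 1 MB. For an item at per-CPU offset X,
 * the free-list node is stored at X + 4 MB (skip the per-CPU slots)
 * + 1 MB (skip the init values), i.e. in the dedicated free-list area
 * at the end of the range. A non-robust populate-none pool places the
 * node in the init values area (X + 4 MB); a non-robust populate-all
 * pool simply reuses the CPU 0 copy of the item (offset X).
 */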
253
254 static
255 off_t ptr_to_off_t(void *p)
256 {
257 return (off_t) (uintptr_t) p;
258 }
259
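/* Compare the n bytes of s against the byte value c: return 0 if they all equal c, else the difference at the first mismatch. */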
260 static
261 int memcmpbyte(const char *s, int c, size_t n)
262 {
263 int res = 0;
264
265 while (n-- > 0)
266 if ((res = *(s++) - c) != 0)
267 break;
268 return res;
269 }
270
271 static
272 void rseq_percpu_zero_item(struct rseq_mempool *pool,
273 struct rseq_mempool_range *range, uintptr_t item_offset)
274 {
275 char *init_p = NULL;
276 int i;
277
278 init_p = __rseq_pool_range_init_ptr(range, item_offset);
279 if (init_p)
280 memset(init_p, 0, pool->item_len);
281 for (i = 0; i < pool->attr.max_nr_cpus; i++) {
282 char *p = __rseq_pool_range_percpu_ptr(range, i,
283 item_offset, pool->attr.stride);
284
285 		/* Skip if the init-area update already propagated to this CPU's mapping (page not copied-on-write). */
286 if (init_p && !memcmpbyte(p, 0, pool->item_len))
287 continue;
288 memset(p, 0, pool->item_len);
289 }
290 }
291
292 static
293 void rseq_percpu_init_item(struct rseq_mempool *pool,
294 struct rseq_mempool_range *range, uintptr_t item_offset,
295 void *init_ptr, size_t init_len)
296 {
297 char *init_p = NULL;
298 int i;
299
300 init_p = __rseq_pool_range_init_ptr(range, item_offset);
301 if (init_p)
302 memcpy(init_p, init_ptr, init_len);
303 for (i = 0; i < pool->attr.max_nr_cpus; i++) {
304 char *p = __rseq_pool_range_percpu_ptr(range, i,
305 item_offset, pool->attr.stride);
306
307 		/* Skip if the init-area update already propagated to this CPU's mapping (page not copied-on-write). */
308 if (init_p && !memcmp(init_p, p, init_len))
309 continue;
310 memcpy(p, init_ptr, init_len);
311 }
312 }
313
314 static
315 void rseq_poison_item(void *p, size_t item_len, uintptr_t poison)
316 {
317 size_t offset;
318
319 for (offset = 0; offset < item_len; offset += sizeof(uintptr_t))
320 *((uintptr_t *) (p + offset)) = poison;
321 }
322
323 static
324 void rseq_percpu_poison_item(struct rseq_mempool *pool,
325 struct rseq_mempool_range *range, uintptr_t item_offset)
326 {
327 uintptr_t poison = pool->attr.poison;
328 char *init_p = NULL;
329 int i;
330
331 init_p = __rseq_pool_range_init_ptr(range, item_offset);
332 if (init_p)
333 rseq_poison_item(init_p, pool->item_len, poison);
334 for (i = 0; i < pool->attr.max_nr_cpus; i++) {
335 char *p = __rseq_pool_range_percpu_ptr(range, i,
336 item_offset, pool->attr.stride);
337
338 		/* Skip if the init-area update already propagated to this CPU's mapping (page not copied-on-write). */
339 if (init_p && !memcmp(init_p, p, pool->item_len))
340 continue;
341 rseq_poison_item(p, pool->item_len, poison);
342 }
343 }
344
345 /* Always inline for __builtin_return_address(0). */
346 static inline __attribute__((always_inline))
347 void rseq_check_poison_item(const struct rseq_mempool *pool, uintptr_t item_offset,
348 void *p, size_t item_len, uintptr_t poison)
349 {
350 size_t offset;
351
352 for (offset = 0; offset < item_len; offset += sizeof(uintptr_t)) {
353 uintptr_t v;
354
355 v = *((uintptr_t *) (p + offset));
356 if (v != poison) {
357 fprintf(stderr, "%s: Poison corruption detected (0x%lx) for pool: \"%s\" (%p), item offset: %zu, caller: %p.\n",
358 __func__, (unsigned long) v, get_pool_name(pool), pool, item_offset, (void *) __builtin_return_address(0));
359 abort();
360 }
361 }
362 }
363
364 /* Always inline for __builtin_return_address(0). */
365 static inline __attribute__((always_inline))
366 void rseq_percpu_check_poison_item(const struct rseq_mempool *pool,
367 const struct rseq_mempool_range *range, uintptr_t item_offset)
368 {
369 uintptr_t poison = pool->attr.poison;
370 char *init_p;
371 int i;
372
373 if (!pool->attr.robust_set)
374 return;
375 init_p = __rseq_pool_range_init_ptr(range, item_offset);
376 if (init_p)
377 rseq_check_poison_item(pool, item_offset, init_p, pool->item_len, poison);
378 for (i = 0; i < pool->attr.max_nr_cpus; i++) {
379 char *p = __rseq_pool_range_percpu_ptr(range, i,
380 item_offset, pool->attr.stride);
381 rseq_check_poison_item(pool, item_offset, p, pool->item_len, poison);
382 }
383 }
384
385 #ifdef HAVE_LIBNUMA
386 int rseq_mempool_range_init_numa(void *addr, size_t len, int cpu, int numa_flags)
387 {
388 unsigned long nr_pages, page_len;
389 int status[MOVE_PAGES_BATCH_SIZE];
390 int nodes[MOVE_PAGES_BATCH_SIZE];
391 void *pages[MOVE_PAGES_BATCH_SIZE];
392 long ret;
393
394 if (!numa_flags) {
395 errno = EINVAL;
396 return -1;
397 }
398 page_len = rseq_get_page_len();
399 nr_pages = len >> rseq_get_count_order_ulong(page_len);
400
401 nodes[0] = numa_node_of_cpu(cpu);
402 if (nodes[0] < 0)
403 return -1;
404
405 for (size_t k = 1; k < RSEQ_ARRAY_SIZE(nodes); ++k) {
406 nodes[k] = nodes[0];
407 }
408
409 for (unsigned long page = 0; page < nr_pages;) {
410
411 size_t max_k = RSEQ_ARRAY_SIZE(pages);
412 size_t left = nr_pages - page;
413
414 if (left < max_k) {
415 max_k = left;
416 }
417
418 for (size_t k = 0; k < max_k; ++k, ++page) {
419 pages[k] = addr + (page * page_len);
420 status[k] = -EPERM;
421 }
422
423 ret = move_pages(0, max_k, pages, nodes, status, numa_flags);
424
425 if (ret < 0)
426 return ret;
427
428 if (ret > 0) {
429 			fprintf(stderr, "%ld pages were not migrated\n", ret);
430 for (size_t k = 0; k < max_k; ++k) {
431 if (status[k] < 0)
432 fprintf(stderr,
433 					"Error while moving page %p to numa node %d: %d\n",
434 pages[k], nodes[k], -status[k]);
435 }
436 }
437 }
438 return 0;
439 }
440 #else
441 int rseq_mempool_range_init_numa(void *addr __attribute__((unused)),
442 size_t len __attribute__((unused)),
443 int cpu __attribute__((unused)),
444 int numa_flags __attribute__((unused)))
445 {
446 errno = ENOSYS;
447 return -1;
448 }
449 #endif
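A hedged sketch of how rseq_mempool_range_init_numa() can be wired into a pool through the init attribute; the helper name and the MPOL_MF_MOVE flag choice are illustrative, not mandated by the library:

#ifdef HAVE_LIBNUMA
/* Move each per-CPU range onto the NUMA node of its CPU at pool creation time. */
static int example_init_numa(void *priv __attribute__((unused)),
		void *addr, size_t len, int cpu)
{
	if (cpu < 0)	/* Global pool range: nothing to migrate. */
		return 0;
	return rseq_mempool_range_init_numa(addr, len, cpu, MPOL_MF_MOVE);
}

/* At pool creation: rseq_mempool_attr_set_init(attr, example_init_numa, NULL); */
#endif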
450
451 static
452 void *default_mmap_func(void *priv __attribute__((unused)), size_t len)
453 {
454 void *base;
455
456 base = mmap(NULL, len, PROT_READ | PROT_WRITE,
457 MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
458 if (base == MAP_FAILED)
459 return NULL;
460 return base;
461 }
462
463 static
464 int default_munmap_func(void *priv __attribute__((unused)), void *ptr, size_t len)
465 {
466 return munmap(ptr, len);
467 }
468
469 static
470 int create_alloc_bitmap(struct rseq_mempool *pool, struct rseq_mempool_range *range)
471 {
472 size_t count;
473
474 count = ((pool->attr.stride >> pool->item_order) + BIT_PER_ULONG - 1) / BIT_PER_ULONG;
475
476 /*
477 * Not being able to create the validation bitmap is an error
478 * that needs to be reported.
479 */
480 range->alloc_bitmap = calloc(count, sizeof(unsigned long));
481 if (!range->alloc_bitmap)
482 return -1;
483 return 0;
484 }
485
486 static
487 bool percpu_addr_in_pool(const struct rseq_mempool *pool, void __rseq_percpu *_addr)
488 {
489 struct rseq_mempool_range *range;
490 void *addr = (void *) _addr;
491
492 for (range = pool->range_list; range; range = range->next) {
493 if (addr >= range->base && addr < range->base + range->next_unused)
494 return true;
495 }
496 return false;
497 }
498
499 /* Always inline for __builtin_return_address(0). */
500 static inline __attribute__((always_inline))
501 void check_free_list(const struct rseq_mempool *pool)
502 {
503 size_t total_item = 0, total_never_allocated = 0, total_freed = 0,
504 max_list_traversal = 0, traversal_iteration = 0;
505 struct rseq_mempool_range *range;
506
507 if (!pool->attr.robust_set)
508 return;
509
510 for (range = pool->range_list; range; range = range->next) {
511 total_item += pool->attr.stride >> pool->item_order;
512 total_never_allocated += (pool->attr.stride - range->next_unused) >> pool->item_order;
513 }
514 max_list_traversal = total_item - total_never_allocated;
515
516 for (struct free_list_node *node = pool->free_list_head, *prev = NULL;
517 node;
518 prev = node,
519 node = node->next) {
520
521 if (traversal_iteration >= max_list_traversal) {
522 fprintf(stderr, "%s: Corrupted free-list; Possibly infinite loop in pool \"%s\" (%p), caller %p.\n",
523 __func__, get_pool_name(pool), pool, __builtin_return_address(0));
524 abort();
525 }
526
527 /* Node is out of range. */
528 if (!percpu_addr_in_pool(pool, __rseq_free_list_to_percpu_ptr(pool, node))) {
529 if (prev)
530 fprintf(stderr, "%s: Corrupted free-list node %p -> [out-of-range %p] in pool \"%s\" (%p), caller %p.\n",
531 __func__, prev, node, get_pool_name(pool), pool, __builtin_return_address(0));
532 else
533 fprintf(stderr, "%s: Corrupted free-list node [out-of-range %p] in pool \"%s\" (%p), caller %p.\n",
534 __func__, node, get_pool_name(pool), pool, __builtin_return_address(0));
535 abort();
536 }
537
538 traversal_iteration++;
539 total_freed++;
540 }
541
542 if (total_never_allocated + total_freed != total_item) {
543 fprintf(stderr, "%s: Corrupted free-list in pool \"%s\" (%p); total-item: %zu total-never-used: %zu total-freed: %zu, caller %p.\n",
544 __func__, get_pool_name(pool), pool, total_item, total_never_allocated, total_freed, __builtin_return_address(0));
545 abort();
546 }
547 }
548
549 /* Always inline for __builtin_return_address(0). */
550 static inline __attribute__((always_inline))
551 void check_range_poison(const struct rseq_mempool *pool,
552 const struct rseq_mempool_range *range)
553 {
554 size_t item_offset;
555
556 for (item_offset = 0; item_offset < range->next_unused;
557 item_offset += pool->item_len)
558 rseq_percpu_check_poison_item(pool, range, item_offset);
559 }
560
561 /* Always inline for __builtin_return_address(0). */
562 static inline __attribute__((always_inline))
563 void check_pool_poison(const struct rseq_mempool *pool)
564 {
565 struct rseq_mempool_range *range;
566
567 if (!pool->attr.robust_set)
568 return;
569 for (range = pool->range_list; range; range = range->next)
570 check_range_poison(pool, range);
571 }
572
573 /* Always inline for __builtin_return_address(0). */
574 static inline __attribute__((always_inline))
575 void destroy_alloc_bitmap(struct rseq_mempool *pool, struct rseq_mempool_range *range)
576 {
577 unsigned long *bitmap = range->alloc_bitmap;
578 size_t count, total_leaks = 0;
579
580 if (!bitmap)
581 return;
582
583 count = ((pool->attr.stride >> pool->item_order) + BIT_PER_ULONG - 1) / BIT_PER_ULONG;
584
585 /* Assert that all items in the pool were freed. */
586 for (size_t k = 0; k < count; ++k)
587 total_leaks += rseq_hweight_ulong(bitmap[k]);
588 if (total_leaks) {
589 fprintf(stderr, "%s: Pool \"%s\" (%p) has %zu leaked items on destroy, caller: %p.\n",
590 __func__, get_pool_name(pool), pool, total_leaks, (void *) __builtin_return_address(0));
591 abort();
592 }
593
594 free(bitmap);
595 range->alloc_bitmap = NULL;
596 }
597
598 /* Always inline for __builtin_return_address(0). */
599 static inline __attribute__((always_inline))
600 int rseq_mempool_range_destroy(struct rseq_mempool *pool,
601 struct rseq_mempool_range *range)
602 {
603 int ret = 0;
604
605 destroy_alloc_bitmap(pool, range);
606
607 /*
608 * Punch a hole into memfd where the init values used to be.
609 */
610 if (range->init) {
611 ret = fallocate(memfd.fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
612 ptr_to_off_t(range->init), pool->attr.stride);
613 if (ret)
614 return ret;
615 range->init = NULL;
616 }
617
618 /* range is a header located one page before the aligned mapping. */
619 return pool->attr.munmap_func(pool->attr.mmap_priv, range->mmap_addr, range->mmap_len);
620 }
621
622 /*
623 * Allocate a memory mapping aligned on @alignment, with an optional
624 * @pre_header before the mapping.
625 */
626 static
627 void *aligned_mmap_anonymous(struct rseq_mempool *pool,
628 size_t page_size, size_t len, size_t alignment,
629 void **pre_header, size_t pre_header_len)
630 {
631 size_t minimum_page_count, page_count, extra, total_allocate = 0;
632 int page_order;
633 void *ptr;
634
635 if (len < page_size || alignment < page_size ||
636 !is_pow2(alignment) || (len & (alignment - 1))) {
637 errno = EINVAL;
638 return NULL;
639 }
640 page_order = rseq_get_count_order_ulong(page_size);
641 if (page_order < 0) {
642 errno = EINVAL;
643 return NULL;
644 }
645 if (pre_header_len && (pre_header_len & (page_size - 1))) {
646 errno = EINVAL;
647 return NULL;
648 }
649
650 minimum_page_count = (pre_header_len + len) >> page_order;
651 page_count = (pre_header_len + len + alignment - page_size) >> page_order;
652
653 assert(page_count >= minimum_page_count);
654
655 ptr = pool->attr.mmap_func(pool->attr.mmap_priv, page_count << page_order);
656 if (!ptr)
657 goto alloc_error;
658
659 total_allocate = page_count << page_order;
660
661 if (!(((uintptr_t) ptr + pre_header_len) & (alignment - 1))) {
662 /* Pointer is already aligned. ptr points to pre_header. */
663 goto out;
664 }
665
666 /* Unmap extra before. */
667 extra = offset_align((uintptr_t) ptr + pre_header_len, alignment);
668 assert(!(extra & (page_size - 1)));
669 if (pool->attr.munmap_func(pool->attr.mmap_priv, ptr, extra)) {
670 perror("munmap");
671 abort();
672 }
673 total_allocate -= extra;
674 ptr += extra; /* ptr points to pre_header */
675 page_count -= extra >> page_order;
676 out:
677 assert(page_count >= minimum_page_count);
678
679 if (page_count > minimum_page_count) {
680 void *extra_ptr;
681
682 /* Unmap extra after. */
683 extra_ptr = ptr + (minimum_page_count << page_order);
684 extra = (page_count - minimum_page_count) << page_order;
685 if (pool->attr.munmap_func(pool->attr.mmap_priv, extra_ptr, extra)) {
686 perror("munmap");
687 abort();
688 }
689 total_allocate -= extra;
690 }
691
692 assert(!(((uintptr_t)ptr + pre_header_len) & (alignment - 1)));
693 assert(total_allocate == len + pre_header_len);
694
695 alloc_error:
696 if (ptr) {
697 if (pre_header)
698 *pre_header = ptr;
699 ptr += pre_header_len;
700 }
701 return ptr;
702 }
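To make the trimming logic concrete, a worked example with assumed numbers:

/*
 * Example (assumed numbers): page_size = 4 KB, len = 4 MB,
 * alignment = 4 MB, pre_header_len = 4 KB (one header page).
 * minimum_page_count = (4 KB + 4 MB) / 4 KB = 1025 pages;
 * page_count = (4 KB + 4 MB + 4 MB - 4 KB) / 4 KB = 2048 pages mapped.
 * If ptr + 4 KB is not already 4 MB aligned, the bytes up to the next
 * 4 MB boundary are unmapped from the front, then everything beyond
 * the 1025 required pages is unmapped from the back, leaving one
 * header page followed by a 4 MB aligned, 4 MB long region.
 */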
703
704 static
705 int rseq_memfd_reserve_init(void *init, size_t init_len)
706 {
707 int ret = 0;
708 size_t reserve_len;
709
710 pthread_mutex_lock(&memfd.lock);
711 reserve_len = (size_t) ptr_to_off_t(init) + init_len;
712 if (reserve_len > memfd.reserved_size) {
713 if (ftruncate(memfd.fd, (off_t) reserve_len)) {
714 ret = -1;
715 goto unlock;
716 }
717 memfd.reserved_size = reserve_len;
718 }
719 unlock:
720 pthread_mutex_unlock(&memfd.lock);
721 return ret;
722 }
723
724 static
725 struct rseq_mempool_range *rseq_mempool_range_create(struct rseq_mempool *pool)
726 {
727 struct rseq_mempool_range *range;
728 unsigned long page_size;
729 void *header;
730 void *base;
731 size_t range_len; /* Range len excludes header. */
732
733 if (pool->attr.max_nr_ranges &&
734 pool->nr_ranges >= pool->attr.max_nr_ranges) {
735 errno = ENOMEM;
736 return NULL;
737 }
738 page_size = rseq_get_page_len();
739
740 range_len = pool->attr.stride * pool->attr.max_nr_cpus;
741 if (pool->attr.populate_policy != RSEQ_MEMPOOL_POPULATE_ALL)
742 range_len += pool->attr.stride; /* init values */
743 if (pool->attr.robust_set)
744 range_len += pool->attr.stride; /* free list */
745 base = aligned_mmap_anonymous(pool, page_size,
746 range_len,
747 pool->attr.stride,
748 &header, page_size);
749 if (!base)
750 return NULL;
751 range = (struct rseq_mempool_range *) (base - RANGE_HEADER_OFFSET);
752 range->pool = pool;
753 range->header = header;
754 range->base = base;
755 range->mmap_addr = header;
756 range->mmap_len = page_size + range_len;
757
758 if (pool->attr.populate_policy != RSEQ_MEMPOOL_POPULATE_ALL) {
759 range->init = base + (pool->attr.stride * pool->attr.max_nr_cpus);
760 /* Populate init values pages from memfd */
761 if (rseq_memfd_reserve_init(range->init, pool->attr.stride))
762 goto error_alloc;
763 if (mmap(range->init, pool->attr.stride, PROT_READ | PROT_WRITE,
764 MAP_SHARED | MAP_FIXED, memfd.fd,
765 ptr_to_off_t(range->init)) != (void *) range->init) {
766 goto error_alloc;
767 }
768 assert(pool->attr.type == MEMPOOL_TYPE_PERCPU);
769 /*
770 * Map per-cpu memory as private COW mappings of init values.
771 */
772 {
773 int cpu;
774
775 for (cpu = 0; cpu < pool->attr.max_nr_cpus; cpu++) {
776 void *p = base + (pool->attr.stride * cpu);
777 size_t len = pool->attr.stride;
778
779 if (mmap(p, len, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_FIXED,
780 memfd.fd, ptr_to_off_t(range->init)) != (void *) p) {
781 goto error_alloc;
782 }
783 }
784 }
785 }
786
787 if (pool->attr.robust_set) {
788 if (create_alloc_bitmap(pool, range))
789 goto error_alloc;
790 }
791 if (pool->attr.init_set) {
792 switch (pool->attr.type) {
793 case MEMPOOL_TYPE_GLOBAL:
794 if (pool->attr.init_func(pool->attr.init_priv,
795 base, pool->attr.stride, -1)) {
796 goto error_alloc;
797 }
798 break;
799 case MEMPOOL_TYPE_PERCPU:
800 {
801 int cpu;
802 for (cpu = 0; cpu < pool->attr.max_nr_cpus; cpu++) {
803 if (pool->attr.init_func(pool->attr.init_priv,
804 base + (pool->attr.stride * cpu),
805 pool->attr.stride, cpu)) {
806 goto error_alloc;
807 }
808 }
809 break;
810 }
811 default:
812 abort();
813 }
814 }
815 pool->nr_ranges++;
816 return range;
817
818 error_alloc:
819 (void) rseq_mempool_range_destroy(pool, range);
820 return NULL;
821 }
822
823 static
824 int rseq_mempool_memfd_ref(struct rseq_mempool *pool)
825 {
826 int ret = 0;
827
828 if (pool->attr.populate_policy == RSEQ_MEMPOOL_POPULATE_ALL)
829 return 0;
830
831 pthread_mutex_lock(&memfd.lock);
832 if (memfd.refcount == 0) {
833 memfd.fd = memfd_create("mempool", MFD_CLOEXEC);
834 if (memfd.fd < 0) {
835 perror("memfd_create");
836 ret = -1;
837 goto unlock;
838 }
839 }
840 memfd.refcount++;
841 unlock:
842 pthread_mutex_unlock(&memfd.lock);
843 return ret;
844 }
845
846 static
847 void rseq_mempool_memfd_unref(struct rseq_mempool *pool)
848 {
849 if (pool->attr.populate_policy == RSEQ_MEMPOOL_POPULATE_ALL)
850 return;
851
852 pthread_mutex_lock(&memfd.lock);
853 if (memfd.refcount == 1) {
854 if (close(memfd.fd)) {
855 perror("close");
856 abort();
857 }
858 memfd.fd = -1;
859 memfd.reserved_size = 0;
860 }
861 memfd.refcount--;
862 pthread_mutex_unlock(&memfd.lock);
863 }
864
865 int rseq_mempool_destroy(struct rseq_mempool *pool)
866 {
867 struct rseq_mempool_range *range, *next_range;
868 int ret = 0;
869
870 if (!pool)
871 return 0;
872 check_free_list(pool);
873 check_pool_poison(pool);
874 /* Iteration safe against removal. */
875 for (range = pool->range_list; range && (next_range = range->next, 1); range = next_range) {
876 if (rseq_mempool_range_destroy(pool, range))
877 goto end;
878 /* Update list head to keep list coherent in case of partial failure. */
879 pool->range_list = next_range;
880 }
881 rseq_mempool_memfd_unref(pool);
882 pthread_mutex_destroy(&pool->lock);
883 free(pool->name);
884 free(pool);
885 end:
886 return ret;
887 }
888
889 struct rseq_mempool *rseq_mempool_create(const char *pool_name,
890 size_t item_len, const struct rseq_mempool_attr *_attr)
891 {
892 struct rseq_mempool *pool;
893 struct rseq_mempool_attr attr = {};
894 int order;
895
896 /* Make sure each item is large enough to contain free list pointers. */
897 if (item_len < sizeof(void *))
898 item_len = sizeof(void *);
899
900 /* Align item_len on next power of two. */
901 order = rseq_get_count_order_ulong(item_len);
902 if (order < 0) {
903 errno = EINVAL;
904 return NULL;
905 }
906 item_len = 1UL << order;
907
908 if (_attr)
909 memcpy(&attr, _attr, sizeof(attr));
910 if (!attr.mmap_set) {
911 attr.mmap_func = default_mmap_func;
912 attr.munmap_func = default_munmap_func;
913 attr.mmap_priv = NULL;
914 }
915
916 switch (attr.type) {
917 case MEMPOOL_TYPE_PERCPU:
918 if (attr.max_nr_cpus < 0) {
919 errno = EINVAL;
920 return NULL;
921 }
922 if (attr.max_nr_cpus == 0) {
923 /* Auto-detect */
924 attr.max_nr_cpus = rseq_get_max_nr_cpus();
925 if (attr.max_nr_cpus == 0) {
926 errno = EINVAL;
927 return NULL;
928 }
929 }
930 break;
931 case MEMPOOL_TYPE_GLOBAL:
932 /* Override populate policy for global type. */
933 attr.populate_policy = RSEQ_MEMPOOL_POPULATE_ALL;
934 /* Use a 1-cpu pool for global mempool type. */
935 attr.max_nr_cpus = 1;
936 break;
937 }
938 if (!attr.stride)
939 attr.stride = RSEQ_MEMPOOL_STRIDE; /* Use default */
940 if (attr.robust_set && !attr.poison_set) {
941 attr.poison_set = true;
942 attr.poison = DEFAULT_POISON_VALUE;
943 }
944 if (item_len > attr.stride || attr.stride < (size_t) rseq_get_page_len() ||
945 !is_pow2(attr.stride)) {
946 errno = EINVAL;
947 return NULL;
948 }
949
950 pool = calloc(1, sizeof(struct rseq_mempool));
951 if (!pool)
952 return NULL;
953
954 memcpy(&pool->attr, &attr, sizeof(attr));
955 pthread_mutex_init(&pool->lock, NULL);
956 pool->item_len = item_len;
957 pool->item_order = order;
958
959 if (rseq_mempool_memfd_ref(pool))
960 goto error_alloc;
961
962 pool->range_list = rseq_mempool_range_create(pool);
963 if (!pool->range_list)
964 goto error_alloc;
965
966 if (pool_name) {
967 pool->name = strdup(pool_name);
968 if (!pool->name)
969 goto error_alloc;
970 }
971 return pool;
972
973 error_alloc:
974 rseq_mempool_destroy(pool);
975 errno = ENOMEM;
976 return NULL;
977 }
978
979 /* Always inline for __builtin_return_address(0). */
980 static inline __attribute__((always_inline))
981 void set_alloc_slot(struct rseq_mempool *pool, struct rseq_mempool_range *range, size_t item_offset)
982 {
983 unsigned long *bitmap = range->alloc_bitmap;
984 size_t item_index = item_offset >> pool->item_order;
985 unsigned long mask;
986 size_t k;
987
988 if (!bitmap)
989 return;
990
991 k = item_index / BIT_PER_ULONG;
992 mask = 1ULL << (item_index % BIT_PER_ULONG);
993
994 /* Print error if bit is already set. */
995 if (bitmap[k] & mask) {
996 fprintf(stderr, "%s: Allocator corruption detected for pool: \"%s\" (%p), item offset: %zu, caller: %p.\n",
997 __func__, get_pool_name(pool), pool, item_offset, (void *) __builtin_return_address(0));
998 abort();
999 }
1000 bitmap[k] |= mask;
1001 }
1002
1003 static
1004 void __rseq_percpu *__rseq_percpu_malloc(struct rseq_mempool *pool,
1005 bool zeroed, void *init_ptr, size_t init_len)
1006 {
1007 struct rseq_mempool_range *range;
1008 struct free_list_node *node;
1009 uintptr_t item_offset;
1010 void __rseq_percpu *addr;
1011
1012 if (init_len > pool->item_len) {
1013 errno = EINVAL;
1014 return NULL;
1015 }
1016 pthread_mutex_lock(&pool->lock);
1017 /* Get first entry from free list. */
1018 node = pool->free_list_head;
1019 if (node != NULL) {
1020 void *range_base, *ptr;
1021
1022 ptr = __rseq_free_list_to_percpu_ptr(pool, node);
1023 range_base = (void *) ((uintptr_t) ptr & (~(pool->attr.stride - 1)));
1024 range = (struct rseq_mempool_range *) (range_base - RANGE_HEADER_OFFSET);
1025 /* Remove node from free list (update head). */
1026 pool->free_list_head = node->next;
1027 item_offset = (uintptr_t) (ptr - range_base);
1028 rseq_percpu_check_poison_item(pool, range, item_offset);
1029 addr = __rseq_free_list_to_percpu_ptr(pool, node);
1030 goto end;
1031 }
1032 /*
1033 * If the most recent range (first in list) does not have any
1034 * room left, create a new range and prepend it to the list
1035 * head.
1036 */
1037 range = pool->range_list;
1038 if (range->next_unused + pool->item_len > pool->attr.stride) {
1039 range = rseq_mempool_range_create(pool);
1040 if (!range) {
1041 errno = ENOMEM;
1042 addr = NULL;
1043 goto end;
1044 }
1045 /* Add range to head of list. */
1046 range->next = pool->range_list;
1047 pool->range_list = range;
1048 }
1049 /* First range in list has room left. */
1050 item_offset = range->next_unused;
1051 addr = (void __rseq_percpu *) (range->base + item_offset);
1052 range->next_unused += pool->item_len;
1053 end:
1054 if (addr)
1055 set_alloc_slot(pool, range, item_offset);
1056 pthread_mutex_unlock(&pool->lock);
1057 if (addr) {
1058 if (zeroed)
1059 rseq_percpu_zero_item(pool, range, item_offset);
1060 else if (init_ptr) {
1061 rseq_percpu_init_item(pool, range, item_offset,
1062 init_ptr, init_len);
1063 }
1064 }
1065 return addr;
1066 }
1067
1068 void __rseq_percpu *rseq_mempool_percpu_malloc(struct rseq_mempool *pool)
1069 {
1070 return __rseq_percpu_malloc(pool, false, NULL, 0);
1071 }
1072
1073 void __rseq_percpu *rseq_mempool_percpu_zmalloc(struct rseq_mempool *pool)
1074 {
1075 return __rseq_percpu_malloc(pool, true, NULL, 0);
1076 }
1077
1078 void __rseq_percpu *rseq_mempool_percpu_malloc_init(struct rseq_mempool *pool,
1079 void *init_ptr, size_t len)
1080 {
1081 return __rseq_percpu_malloc(pool, false, init_ptr, len);
1082 }
1083
1084 /* Always inline for __builtin_return_address(0). */
1085 static inline __attribute__((always_inline))
1086 void clear_alloc_slot(struct rseq_mempool *pool, struct rseq_mempool_range *range, size_t item_offset)
1087 {
1088 unsigned long *bitmap = range->alloc_bitmap;
1089 size_t item_index = item_offset >> pool->item_order;
1090 unsigned long mask;
1091 size_t k;
1092
1093 if (!bitmap)
1094 return;
1095
1096 k = item_index / BIT_PER_ULONG;
1097 mask = 1ULL << (item_index % BIT_PER_ULONG);
1098
1099 /* Print error if bit is not set. */
1100 if (!(bitmap[k] & mask)) {
1101 fprintf(stderr, "%s: Double-free detected for pool: \"%s\" (%p), item offset: %zu, caller: %p.\n",
1102 __func__, get_pool_name(pool), pool, item_offset,
1103 (void *) __builtin_return_address(0));
1104 abort();
1105 }
1106 bitmap[k] &= ~mask;
1107 }
1108
1109 void librseq_mempool_percpu_free(void __rseq_percpu *_ptr, size_t stride)
1110 {
1111 uintptr_t ptr = (uintptr_t) _ptr;
1112 void *range_base = (void *) (ptr & (~(stride - 1)));
1113 struct rseq_mempool_range *range = (struct rseq_mempool_range *) (range_base - RANGE_HEADER_OFFSET);
1114 struct rseq_mempool *pool = range->pool;
1115 uintptr_t item_offset = ptr & (stride - 1);
1116 struct free_list_node *head, *item;
1117
1118 pthread_mutex_lock(&pool->lock);
1119 clear_alloc_slot(pool, range, item_offset);
1120 /* Add ptr to head of free list */
1121 head = pool->free_list_head;
1122 if (pool->attr.poison_set)
1123 rseq_percpu_poison_item(pool, range, item_offset);
1124 item = __rseq_percpu_to_free_list_ptr(pool, _ptr);
1125 /*
1126 	 * For non-robust pools, setting the next pointer overwrites the first
1127 	 * uintptr_t of poison in either the CPU 0 copy (populate all) or the
1128 	 * init data (populate none). Robust free lists live in their own area.
1129 */
1130 item->next = head;
1131 pool->free_list_head = item;
1132 pthread_mutex_unlock(&pool->lock);
1133 }
1134
1135 struct rseq_mempool_set *rseq_mempool_set_create(void)
1136 {
1137 struct rseq_mempool_set *pool_set;
1138
1139 pool_set = calloc(1, sizeof(struct rseq_mempool_set));
1140 if (!pool_set)
1141 return NULL;
1142 pthread_mutex_init(&pool_set->lock, NULL);
1143 return pool_set;
1144 }
1145
1146 int rseq_mempool_set_destroy(struct rseq_mempool_set *pool_set)
1147 {
1148 int order, ret;
1149
1150 for (order = POOL_SET_MIN_ENTRY; order < POOL_SET_NR_ENTRIES; order++) {
1151 struct rseq_mempool *pool = pool_set->entries[order];
1152
1153 if (!pool)
1154 continue;
1155 ret = rseq_mempool_destroy(pool);
1156 if (ret)
1157 return ret;
1158 pool_set->entries[order] = NULL;
1159 }
1160 pthread_mutex_destroy(&pool_set->lock);
1161 free(pool_set);
1162 return 0;
1163 }
1164
1165 /* Ownership of pool is handed over to pool set on success. */
1166 int rseq_mempool_set_add_pool(struct rseq_mempool_set *pool_set, struct rseq_mempool *pool)
1167 {
1168 size_t item_order = pool->item_order;
1169 int ret = 0;
1170
1171 pthread_mutex_lock(&pool_set->lock);
1172 if (pool_set->entries[item_order]) {
1173 errno = EBUSY;
1174 ret = -1;
1175 goto end;
1176 }
1177 pool_set->entries[pool->item_order] = pool;
1178 end:
1179 pthread_mutex_unlock(&pool_set->lock);
1180 return ret;
1181 }
1182
1183 static
1184 void __rseq_percpu *__rseq_mempool_set_malloc(struct rseq_mempool_set *pool_set,
1185 void *init_ptr, size_t len, bool zeroed)
1186 {
1187 int order, min_order = POOL_SET_MIN_ENTRY;
1188 struct rseq_mempool *pool;
1189 void __rseq_percpu *addr;
1190
1191 order = rseq_get_count_order_ulong(len);
1192 if (order > POOL_SET_MIN_ENTRY)
1193 min_order = order;
1194 again:
1195 pthread_mutex_lock(&pool_set->lock);
1196 /* First smallest present pool where @len fits. */
1197 for (order = min_order; order < POOL_SET_NR_ENTRIES; order++) {
1198 pool = pool_set->entries[order];
1199
1200 if (!pool)
1201 continue;
1202 if (pool->item_len >= len)
1203 goto found;
1204 }
1205 pool = NULL;
1206 found:
1207 pthread_mutex_unlock(&pool_set->lock);
1208 if (pool) {
1209 addr = __rseq_percpu_malloc(pool, zeroed, init_ptr, len);
1210 if (addr == NULL && errno == ENOMEM) {
1211 /*
1212 * If the allocation failed, try again with a
1213 * larger pool.
1214 */
1215 min_order = order + 1;
1216 goto again;
1217 }
1218 } else {
1219 /* Not found. */
1220 errno = ENOMEM;
1221 addr = NULL;
1222 }
1223 return addr;
1224 }
1225
1226 void __rseq_percpu *rseq_mempool_set_percpu_malloc(struct rseq_mempool_set *pool_set, size_t len)
1227 {
1228 return __rseq_mempool_set_malloc(pool_set, NULL, len, false);
1229 }
1230
1231 void __rseq_percpu *rseq_mempool_set_percpu_zmalloc(struct rseq_mempool_set *pool_set, size_t len)
1232 {
1233 return __rseq_mempool_set_malloc(pool_set, NULL, len, true);
1234 }
1235
1236 void __rseq_percpu *rseq_mempool_set_percpu_malloc_init(struct rseq_mempool_set *pool_set,
1237 void *init_ptr, size_t len)
1238 {
1239 	return __rseq_mempool_set_malloc(pool_set, init_ptr, len, false);
1240 }
1241
1242 struct rseq_mempool_attr *rseq_mempool_attr_create(void)
1243 {
1244 return calloc(1, sizeof(struct rseq_mempool_attr));
1245 }
1246
1247 void rseq_mempool_attr_destroy(struct rseq_mempool_attr *attr)
1248 {
1249 free(attr);
1250 }
1251
1252 int rseq_mempool_attr_set_mmap(struct rseq_mempool_attr *attr,
1253 void *(*mmap_func)(void *priv, size_t len),
1254 int (*munmap_func)(void *priv, void *ptr, size_t len),
1255 void *mmap_priv)
1256 {
1257 if (!attr) {
1258 errno = EINVAL;
1259 return -1;
1260 }
1261 attr->mmap_set = true;
1262 attr->mmap_func = mmap_func;
1263 attr->munmap_func = munmap_func;
1264 attr->mmap_priv = mmap_priv;
1265 return 0;
1266 }
1267
1268 int rseq_mempool_attr_set_init(struct rseq_mempool_attr *attr,
1269 int (*init_func)(void *priv, void *addr, size_t len, int cpu),
1270 void *init_priv)
1271 {
1272 if (!attr) {
1273 errno = EINVAL;
1274 return -1;
1275 }
1276 attr->init_set = true;
1277 attr->init_func = init_func;
1278 attr->init_priv = init_priv;
1279 return 0;
1280 }
1281
1282 int rseq_mempool_attr_set_robust(struct rseq_mempool_attr *attr)
1283 {
1284 if (!attr) {
1285 errno = EINVAL;
1286 return -1;
1287 }
1288 attr->robust_set = true;
1289 return 0;
1290 }
1291
1292 int rseq_mempool_attr_set_percpu(struct rseq_mempool_attr *attr,
1293 size_t stride, int max_nr_cpus)
1294 {
1295 if (!attr) {
1296 errno = EINVAL;
1297 return -1;
1298 }
1299 attr->type = MEMPOOL_TYPE_PERCPU;
1300 attr->stride = stride;
1301 attr->max_nr_cpus = max_nr_cpus;
1302 return 0;
1303 }
1304
1305 int rseq_mempool_attr_set_global(struct rseq_mempool_attr *attr,
1306 size_t stride)
1307 {
1308 if (!attr) {
1309 errno = EINVAL;
1310 return -1;
1311 }
1312 attr->type = MEMPOOL_TYPE_GLOBAL;
1313 attr->stride = stride;
1314 attr->max_nr_cpus = 0;
1315 return 0;
1316 }
1317
1318 int rseq_mempool_attr_set_max_nr_ranges(struct rseq_mempool_attr *attr,
1319 unsigned long max_nr_ranges)
1320 {
1321 if (!attr) {
1322 errno = EINVAL;
1323 return -1;
1324 }
1325 attr->max_nr_ranges = max_nr_ranges;
1326 return 0;
1327 }
1328
1329 int rseq_mempool_attr_set_poison(struct rseq_mempool_attr *attr,
1330 uintptr_t poison)
1331 {
1332 if (!attr) {
1333 errno = EINVAL;
1334 return -1;
1335 }
1336 attr->poison_set = true;
1337 attr->poison = poison;
1338 return 0;
1339 }
1340
1341 int rseq_mempool_attr_set_populate_policy(struct rseq_mempool_attr *attr,
1342 enum rseq_mempool_populate_policy policy)
1343 {
1344 if (!attr) {
1345 errno = EINVAL;
1346 return -1;
1347 }
1348 attr->populate_policy = policy;
1349 return 0;
1350 }
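A configuration sketch combining the setters above. It assumes the populate-none enumerator is spelled RSEQ_MEMPOOL_POPULATE_NONE, as the comments earlier in this file suggest; the pool name, poison value and range cap are arbitrary.

static struct rseq_mempool *example_debug_pool(size_t item_len)
{
	struct rseq_mempool_attr *attr = rseq_mempool_attr_create();
	struct rseq_mempool *pool;

	rseq_mempool_attr_set_percpu(attr, RSEQ_MEMPOOL_STRIDE, 0);
	/* Populate-none: per-CPU pages are COW mappings of the memfd-backed init values. */
	rseq_mempool_attr_set_populate_policy(attr, RSEQ_MEMPOOL_POPULATE_NONE);
	rseq_mempool_attr_set_robust(attr);		/* Free-list, leak and poison checks. */
	rseq_mempool_attr_set_poison(attr, (uintptr_t) 0xDEADBEEF);
	rseq_mempool_attr_set_max_nr_ranges(attr, 8);	/* Cap pool growth to 8 ranges. */
	pool = rseq_mempool_create("example-debug", item_len, attr);
	rseq_mempool_attr_destroy(attr);
	return pool;
}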
1351
1352 int rseq_mempool_get_max_nr_cpus(struct rseq_mempool *mempool)
1353 {
1354 if (!mempool || mempool->attr.type != MEMPOOL_TYPE_PERCPU) {
1355 errno = EINVAL;
1356 return -1;
1357 }
1358 return mempool->attr.max_nr_cpus;
1359 }