X-Git-Url: http://drtracing.org/?a=blobdiff_plain;f=src%2Frseq-mempool.c;h=c5b46178cdf7b8efcab00fbef7d4b499086a62ec;hb=805d0043db4c6d645a783c1a994d2d43a8e946e1;hp=1562a75a4ea25b404d15707039046dc85792727d;hpb=6fbf1fb6d647865da78d05007721cc983c8616aa;p=librseq.git diff --git a/src/rseq-mempool.c b/src/rseq-mempool.c index 1562a75..c5b4617 100644 --- a/src/rseq-mempool.c +++ b/src/rseq-mempool.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: MIT // SPDX-FileCopyrightText: 2024 Mathieu Desnoyers +// SPDX-FileCopyrightText: 2024 Olivier Dion #include #include @@ -13,6 +14,7 @@ #include #include #include +#include #ifdef HAVE_LIBNUMA # include @@ -37,6 +39,8 @@ #define POOL_SET_NR_ENTRIES RSEQ_BITS_PER_LONG +#define POOL_HEADER_NR_PAGES 2 + /* * Smallest allocation should hold enough space for a free list pointer. */ @@ -52,6 +56,18 @@ #define RANGE_HEADER_OFFSET sizeof(struct rseq_mempool_range) +#if RSEQ_BITS_PER_LONG == 64 +# define DEFAULT_COW_INIT_POISON_VALUE 0x5555555555555555ULL +#else +# define DEFAULT_COW_INIT_POISON_VALUE 0x55555555UL +#endif + +/* + * Define the default COW_ZERO poison value as zero to prevent useless + * COW page allocation when writing poison values when freeing items. + */ +#define DEFAULT_COW_ZERO_POISON_VALUE 0x0 + struct free_list_node; struct free_list_node { @@ -64,11 +80,6 @@ enum mempool_type { }; struct rseq_mempool_attr { - bool mmap_set; - void *(*mmap_func)(void *priv, size_t len); - int (*munmap_func)(void *priv, void *ptr, size_t len); - void *mmap_priv; - bool init_set; int (*init_func)(void *priv, void *addr, size_t len, int cpu); void *init_priv; @@ -83,6 +94,8 @@ struct rseq_mempool_attr { bool poison_set; uintptr_t poison; + + enum rseq_mempool_populate_policy populate_policy; }; struct rseq_mempool_range; @@ -90,9 +103,40 @@ struct rseq_mempool_range; struct rseq_mempool_range { struct rseq_mempool_range *next; /* Linked list of ranges. */ struct rseq_mempool *pool; /* Backward reference to container pool. */ + + /* + * Memory layout of a mempool range: + * - Canary header page (for detection of destroy-after-fork of + * COW_INIT pool), + * - Header page (contains struct rseq_mempool_range at the + * very end), + * - Base of the per-cpu data, starting with CPU 0. + * Aliases with free-list for non-robust COW_ZERO pool. + * - CPU 1, + * ... + * - CPU max_nr_cpus - 1 + * - init values (only allocated for COW_INIT pool). + * Aliases with free-list for non-robust COW_INIT pool. + * - free list (for robust pool). + * + * The free list aliases the CPU 0 memory area for non-robust + * COW_ZERO pools. It aliases with init values for non-robust + * COW_INIT pools. It is located immediately after the init + * values for robust pools. + */ void *header; void *base; + /* + * The init values contains malloc_init/zmalloc values. + * Pointer is NULL for RSEQ_MEMPOOL_POPULATE_COW_ZERO. + */ + void *init; size_t next_unused; + + /* Pool range mmap/munmap */ + void *mmap_addr; + size_t mmap_len; + /* Track alloc/free. */ unsigned long *alloc_bitmap; }; @@ -106,11 +150,21 @@ struct rseq_mempool { int item_order; /* - * The free list chains freed items on the CPU 0 address range. - * We should rethink this decision if false sharing between - * malloc/free from other CPUs and data accesses from CPU 0 - * becomes an issue. This is a NULL-terminated singly-linked - * list. + * COW_INIT non-robust pools: + * The free list chains freed items on the init + * values address range. 
+ * + * COW_ZERO non-robust pools: + * The free list chains freed items on the CPU 0 + * address range. We should rethink this + * decision if false sharing between malloc/free + * from other CPUs and data accesses from CPU 0 + * becomes an issue. + * + * Robust pools: The free list chains freed items in the + * address range dedicated for the free list. + * + * This is a NULL-terminated singly-linked list. */ struct free_list_node *free_list_head; @@ -145,64 +199,206 @@ void *__rseq_pool_range_percpu_ptr(const struct rseq_mempool_range *range, int c return range->base + (stride * cpu) + item_offset; } +static +void *__rseq_pool_range_init_ptr(const struct rseq_mempool_range *range, + uintptr_t item_offset) +{ + if (!range->init) + return NULL; + return range->init + item_offset; +} + +static +void __rseq_percpu *__rseq_free_list_to_percpu_ptr(const struct rseq_mempool *pool, + struct free_list_node *node) +{ + void __rseq_percpu *p = (void __rseq_percpu *) node; + + if (pool->attr.robust_set) { + /* Skip cpus. */ + p -= pool->attr.max_nr_cpus * pool->attr.stride; + /* Skip init values */ + if (pool->attr.populate_policy == RSEQ_MEMPOOL_POPULATE_COW_INIT) + p -= pool->attr.stride; + + } else { + /* COW_INIT free list is in init values */ + if (pool->attr.populate_policy == RSEQ_MEMPOOL_POPULATE_COW_INIT) + p -= pool->attr.max_nr_cpus * pool->attr.stride; + } + return p; +} + +static +struct free_list_node *__rseq_percpu_to_free_list_ptr(const struct rseq_mempool *pool, + void __rseq_percpu *p) +{ + if (pool->attr.robust_set) { + /* Skip cpus. */ + p += pool->attr.max_nr_cpus * pool->attr.stride; + /* Skip init values */ + if (pool->attr.populate_policy == RSEQ_MEMPOOL_POPULATE_COW_INIT) + p += pool->attr.stride; + + } else { + /* COW_INIT free list is in init values */ + if (pool->attr.populate_policy == RSEQ_MEMPOOL_POPULATE_COW_INIT) + p += pool->attr.max_nr_cpus * pool->attr.stride; + } + return (struct free_list_node *) p; +} + +static +intptr_t rseq_cmp_item(void *p, size_t item_len, intptr_t cmp_value, intptr_t *unexpected_value) +{ + size_t offset; + intptr_t res = 0; + + for (offset = 0; offset < item_len; offset += sizeof(uintptr_t)) { + intptr_t v = *((intptr_t *) (p + offset)); + + if ((res = v - cmp_value) != 0) { + if (unexpected_value) + *unexpected_value = v; + break; + } + } + return res; +} + static void rseq_percpu_zero_item(struct rseq_mempool *pool, struct rseq_mempool_range *range, uintptr_t item_offset) { + char *init_p = NULL; int i; + init_p = __rseq_pool_range_init_ptr(range, item_offset); + if (init_p) + bzero(init_p, pool->item_len); for (i = 0; i < pool->attr.max_nr_cpus; i++) { char *p = __rseq_pool_range_percpu_ptr(range, i, item_offset, pool->attr.stride); - memset(p, 0, pool->item_len); + + /* + * If item is already zeroed, either because the + * init range update has propagated or because the + * content is already zeroed (e.g. zero page), don't + * write to the page. This eliminates useless COW over + * the zero page just for overwriting it with zeroes. + * + * This means zmalloc() in COW_ZERO policy pool do + * not trigger COW for CPUs which are not actively + * writing to the pool. This is however not the case for + * malloc_init() in populate-all pools if it populates + * non-zero content. 
+ */ + if (!rseq_cmp_item(p, pool->item_len, 0, NULL)) + continue; + bzero(p, pool->item_len); } } +static +void rseq_percpu_init_item(struct rseq_mempool *pool, + struct rseq_mempool_range *range, uintptr_t item_offset, + void *init_ptr, size_t init_len) +{ + char *init_p = NULL; + int i; + + init_p = __rseq_pool_range_init_ptr(range, item_offset); + if (init_p) + memcpy(init_p, init_ptr, init_len); + for (i = 0; i < pool->attr.max_nr_cpus; i++) { + char *p = __rseq_pool_range_percpu_ptr(range, i, + item_offset, pool->attr.stride); + + /* + * If the update propagated through a shared mapping, + * or the item already has the correct content, skip + * writing it into the cpu item to eliminate useless + * COW of the page. + */ + if (!memcmp(init_ptr, p, init_len)) + continue; + memcpy(p, init_ptr, init_len); + } +} + +static +void rseq_poison_item(void *p, size_t item_len, uintptr_t poison) +{ + size_t offset; + + for (offset = 0; offset < item_len; offset += sizeof(uintptr_t)) + *((uintptr_t *) (p + offset)) = poison; +} + static void rseq_percpu_poison_item(struct rseq_mempool *pool, struct rseq_mempool_range *range, uintptr_t item_offset) { uintptr_t poison = pool->attr.poison; + char *init_p = NULL; int i; + init_p = __rseq_pool_range_init_ptr(range, item_offset); + if (init_p) + rseq_poison_item(init_p, pool->item_len, poison); for (i = 0; i < pool->attr.max_nr_cpus; i++) { char *p = __rseq_pool_range_percpu_ptr(range, i, item_offset, pool->attr.stride); - size_t offset; - for (offset = 0; offset < pool->item_len; offset += sizeof(uintptr_t)) - *((uintptr_t *) (p + offset)) = poison; + /* + * If the update propagated through a shared mapping, + * or the item already has the correct content, skip + * writing it into the cpu item to eliminate useless + * COW of the page. + * + * It is recommended to use zero as poison value for + * COW_ZERO pools to eliminate COW due to writing + * poison to CPU memory still backed by the zero page. + */ + if (rseq_cmp_item(p, pool->item_len, poison, NULL) == 0) + continue; + rseq_poison_item(p, pool->item_len, poison); } } +/* Always inline for __builtin_return_address(0). */ +static inline __attribute__((always_inline)) +void rseq_check_poison_item(const struct rseq_mempool *pool, uintptr_t item_offset, + void *p, size_t item_len, uintptr_t poison) +{ + intptr_t unexpected_value; + + if (rseq_cmp_item(p, item_len, poison, &unexpected_value) == 0) + return; + + fprintf(stderr, "%s: Poison corruption detected (0x%lx) for pool: \"%s\" (%p), item offset: %zu, caller: %p.\n", + __func__, (unsigned long) unexpected_value, get_pool_name(pool), pool, item_offset, (void *) __builtin_return_address(0)); + abort(); +} + /* Always inline for __builtin_return_address(0). */ static inline __attribute__((always_inline)) void rseq_percpu_check_poison_item(const struct rseq_mempool *pool, const struct rseq_mempool_range *range, uintptr_t item_offset) { uintptr_t poison = pool->attr.poison; + char *init_p; int i; - if (!pool->attr.robust_set || !pool->attr.poison_set) + if (!pool->attr.robust_set) return; + init_p = __rseq_pool_range_init_ptr(range, item_offset); + if (init_p) + rseq_check_poison_item(pool, item_offset, init_p, pool->item_len, poison); for (i = 0; i < pool->attr.max_nr_cpus; i++) { char *p = __rseq_pool_range_percpu_ptr(range, i, item_offset, pool->attr.stride); - size_t offset; - - for (offset = 0; offset < pool->item_len; offset += sizeof(uintptr_t)) { - uintptr_t v; - - /* Skip poison check for free-list pointer. 
*/ - if (i == 0 && offset == 0) - continue; - v = *((uintptr_t *) (p + offset)); - if (v != poison) { - fprintf(stderr, "%s: Poison corruption detected (0x%lx) for pool: \"%s\" (%p), item offset: %zu, caller: %p.\n", - __func__, (unsigned long) v, get_pool_name(pool), pool, item_offset, (void *) __builtin_return_address(0)); - abort(); - } - } + rseq_check_poison_item(pool, item_offset, p, pool->item_len, poison); } } @@ -272,24 +468,6 @@ int rseq_mempool_range_init_numa(void *addr __attribute__((unused)), } #endif -static -void *default_mmap_func(void *priv __attribute__((unused)), size_t len) -{ - void *base; - - base = mmap(NULL, len, PROT_READ | PROT_WRITE, - MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); - if (base == MAP_FAILED) - return NULL; - return base; -} - -static -int default_munmap_func(void *priv __attribute__((unused)), void *ptr, size_t len) -{ - return munmap(ptr, len); -} - static int create_alloc_bitmap(struct rseq_mempool *pool, struct rseq_mempool_range *range) { @@ -308,9 +486,10 @@ int create_alloc_bitmap(struct rseq_mempool *pool, struct rseq_mempool_range *ra } static -bool addr_in_pool(const struct rseq_mempool *pool, void *addr) +bool percpu_addr_in_pool(const struct rseq_mempool *pool, void __rseq_percpu *_addr) { struct rseq_mempool_range *range; + void *addr = (void *) _addr; for (range = pool->range_list; range; range = range->next) { if (addr >= range->base && addr < range->base + range->next_unused) @@ -321,13 +500,13 @@ bool addr_in_pool(const struct rseq_mempool *pool, void *addr) /* Always inline for __builtin_return_address(0). */ static inline __attribute__((always_inline)) -void check_free_list(const struct rseq_mempool *pool) +void check_free_list(const struct rseq_mempool *pool, bool mapping_accessible) { size_t total_item = 0, total_never_allocated = 0, total_freed = 0, max_list_traversal = 0, traversal_iteration = 0; struct rseq_mempool_range *range; - if (!pool->attr.robust_set) + if (!pool->attr.robust_set || !mapping_accessible) return; for (range = pool->range_list; range; range = range->next) { @@ -341,8 +520,6 @@ void check_free_list(const struct rseq_mempool *pool) prev = node, node = node->next) { - void *node_addr = node; - if (traversal_iteration >= max_list_traversal) { fprintf(stderr, "%s: Corrupted free-list; Possibly infinite loop in pool \"%s\" (%p), caller %p.\n", __func__, get_pool_name(pool), pool, __builtin_return_address(0)); @@ -350,7 +527,7 @@ void check_free_list(const struct rseq_mempool *pool) } /* Node is out of range. */ - if (!addr_in_pool(pool, node_addr)) { + if (!percpu_addr_in_pool(pool, __rseq_free_list_to_percpu_ptr(pool, node))) { if (prev) fprintf(stderr, "%s: Corrupted free-list node %p -> [out-of-range %p] in pool \"%s\" (%p), caller %p.\n", __func__, prev, node, get_pool_name(pool), pool, __builtin_return_address(0)); @@ -385,11 +562,11 @@ void check_range_poison(const struct rseq_mempool *pool, /* Always inline for __builtin_return_address(0). 
*/ static inline __attribute__((always_inline)) -void check_pool_poison(const struct rseq_mempool *pool) +void check_pool_poison(const struct rseq_mempool *pool, bool mapping_accessible) { struct rseq_mempool_range *range; - if (!pool->attr.robust_set || !pool->attr.poison_set) + if (!pool->attr.robust_set || !mapping_accessible) return; for (range = pool->range_list; range; range = range->next) check_range_poison(pool, range); @@ -417,17 +594,24 @@ void destroy_alloc_bitmap(struct rseq_mempool *pool, struct rseq_mempool_range * } free(bitmap); + range->alloc_bitmap = NULL; } /* Always inline for __builtin_return_address(0). */ static inline __attribute__((always_inline)) int rseq_mempool_range_destroy(struct rseq_mempool *pool, - struct rseq_mempool_range *range) + struct rseq_mempool_range *range, + bool mapping_accessible) { destroy_alloc_bitmap(pool, range); - /* range is a header located one page before the aligned mapping. */ - return pool->attr.munmap_func(pool->attr.mmap_priv, range->header, - (pool->attr.stride * pool->attr.max_nr_cpus) + rseq_get_page_len()); + if (!mapping_accessible) { + /* + * Only the header pages are populated in the child + * process. + */ + return munmap(range->header, POOL_HEADER_NR_PAGES * rseq_get_page_len()); + } + return munmap(range->mmap_addr, range->mmap_len); } /* @@ -435,8 +619,7 @@ int rseq_mempool_range_destroy(struct rseq_mempool *pool, * @pre_header before the mapping. */ static -void *aligned_mmap_anonymous(struct rseq_mempool *pool, - size_t page_size, size_t len, size_t alignment, +void *aligned_mmap_anonymous(size_t page_size, size_t len, size_t alignment, void **pre_header, size_t pre_header_len) { size_t minimum_page_count, page_count, extra, total_allocate = 0; @@ -463,9 +646,12 @@ void *aligned_mmap_anonymous(struct rseq_mempool *pool, assert(page_count >= minimum_page_count); - ptr = pool->attr.mmap_func(pool->attr.mmap_priv, page_count << page_order); - if (!ptr) + ptr = mmap(NULL, page_count << page_order, PROT_READ | PROT_WRITE, + MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); + if (ptr == MAP_FAILED) { + ptr = NULL; goto alloc_error; + } total_allocate = page_count << page_order; @@ -477,7 +663,7 @@ void *aligned_mmap_anonymous(struct rseq_mempool *pool, /* Unmap extra before. */ extra = offset_align((uintptr_t) ptr + pre_header_len, alignment); assert(!(extra & (page_size - 1))); - if (pool->attr.munmap_func(pool->attr.mmap_priv, ptr, extra)) { + if (munmap(ptr, extra)) { perror("munmap"); abort(); } @@ -493,7 +679,7 @@ out: /* Unmap extra after. */ extra_ptr = ptr + (minimum_page_count << page_order); extra = (page_count - minimum_page_count) << page_order; - if (pool->attr.munmap_func(pool->attr.mmap_priv, extra_ptr, extra)) { + if (munmap(extra_ptr, extra)) { perror("munmap"); abort(); } @@ -512,6 +698,44 @@ alloc_error: return ptr; } +static +int rseq_memfd_create_init(const char *poolname, size_t init_len) +{ + int fd; + char buf[249]; /* Limit is 249 bytes. 
*/ + const char *name; + + if (poolname) { + snprintf(buf, sizeof(buf), "%s:rseq-mempool", poolname); + name = buf; + } else { + name = ":rseq-mempool"; + } + + fd = memfd_create(name, MFD_CLOEXEC); + if (fd < 0) { + perror("memfd_create"); + goto end; + } + if (ftruncate(fd, (off_t) init_len)) { + if (close(fd)) + perror("close"); + fd = -1; + goto end; + } +end: + return fd; +} + +static +void rseq_memfd_close(int fd) +{ + if (fd < 0) + return; + if (close(fd)) + perror("close"); +} + static struct rseq_mempool_range *rseq_mempool_range_create(struct rseq_mempool *pool) { @@ -519,6 +743,9 @@ struct rseq_mempool_range *rseq_mempool_range_create(struct rseq_mempool *pool) unsigned long page_size; void *header; void *base; + size_t range_len; /* Range len excludes header. */ + size_t header_len; + int memfd = -1; if (pool->attr.max_nr_ranges && pool->nr_ranges >= pool->attr.max_nr_ranges) { @@ -527,16 +754,76 @@ struct rseq_mempool_range *rseq_mempool_range_create(struct rseq_mempool *pool) } page_size = rseq_get_page_len(); - base = aligned_mmap_anonymous(pool, page_size, - pool->attr.stride * pool->attr.max_nr_cpus, - pool->attr.stride, - &header, page_size); + header_len = POOL_HEADER_NR_PAGES * page_size; + range_len = pool->attr.stride * pool->attr.max_nr_cpus; + if (pool->attr.populate_policy == RSEQ_MEMPOOL_POPULATE_COW_INIT) + range_len += pool->attr.stride; /* init values */ + if (pool->attr.robust_set) + range_len += pool->attr.stride; /* dedicated free list */ + base = aligned_mmap_anonymous(page_size, range_len, + pool->attr.stride, &header, header_len); if (!base) return NULL; range = (struct rseq_mempool_range *) (base - RANGE_HEADER_OFFSET); range->pool = pool; - range->base = base; range->header = header; + range->base = base; + range->mmap_addr = header; + range->mmap_len = header_len + range_len; + + if (pool->attr.populate_policy == RSEQ_MEMPOOL_POPULATE_COW_INIT) { + range->init = base + (pool->attr.stride * pool->attr.max_nr_cpus); + /* Populate init values pages from memfd */ + memfd = rseq_memfd_create_init(pool->name, pool->attr.stride); + if (memfd < 0) + goto error_alloc; + if (mmap(range->init, pool->attr.stride, PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_FIXED, memfd, 0) != (void *) range->init) + goto error_alloc; + assert(pool->attr.type == MEMPOOL_TYPE_PERCPU); + /* + * Map per-cpu memory as private COW mappings of init values. + */ + { + int cpu; + + for (cpu = 0; cpu < pool->attr.max_nr_cpus; cpu++) { + void *p = base + (pool->attr.stride * cpu); + size_t len = pool->attr.stride; + + if (mmap(p, len, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_FIXED, + memfd, 0) != (void *) p) + goto error_alloc; + } + } + /* + * The init values shared mapping should not be shared + * with the children processes across fork. Prevent the + * whole mapping from being used across fork. + */ + if (madvise(base, range_len, MADV_DONTFORK)) + goto error_alloc; + + /* + * Write 0x1 in first byte of header first page, which + * will be WIPEONFORK (and thus cleared) in children + * processes. Used to find out if pool destroy is called + * from a child process after fork. + */ + *((char *) header) = 0x1; + if (madvise(header, page_size, MADV_WIPEONFORK)) + goto error_alloc; + + /* + * The second header page contains the struct + * rseq_mempool_range, which is needed by pool destroy. + * Leave this anonymous page populated (COW) in child + * processes. 
+ */ + rseq_memfd_close(memfd); + memfd = -1; + } + if (pool->attr.robust_set) { if (create_alloc_bitmap(pool, range)) goto error_alloc; @@ -569,29 +856,67 @@ struct rseq_mempool_range *rseq_mempool_range_create(struct rseq_mempool *pool) return range; error_alloc: - (void) rseq_mempool_range_destroy(pool, range); + rseq_memfd_close(memfd); + (void) rseq_mempool_range_destroy(pool, range, true); return NULL; } +static +bool pool_mappings_accessible(struct rseq_mempool *pool) +{ + struct rseq_mempool_range *range; + size_t page_size; + char *addr; + + if (pool->attr.populate_policy != RSEQ_MEMPOOL_POPULATE_COW_INIT) + return true; + range = pool->range_list; + if (!range) + return true; + page_size = rseq_get_page_len(); + /* + * Header first page is one page before the page containing the + * range structure. + */ + addr = (char *) ((uintptr_t) range & ~(page_size - 1)) - page_size; + /* + * Look for 0x1 first byte marker in header first page. + */ + if (*addr != 0x1) + return false; + return true; +} + int rseq_mempool_destroy(struct rseq_mempool *pool) { struct rseq_mempool_range *range, *next_range; + bool mapping_accessible; int ret = 0; if (!pool) return 0; - check_free_list(pool); - check_pool_poison(pool); + + /* + * Validate that the pool mappings are accessible before doing + * free list/poison validation and unmapping ranges. This allows + * calling pool destroy in child process after a fork for COW_INIT + * pools to free pool resources. + */ + mapping_accessible = pool_mappings_accessible(pool); + + check_free_list(pool, mapping_accessible); + check_pool_poison(pool, mapping_accessible); + /* Iteration safe against removal. */ for (range = pool->range_list; range && (next_range = range->next, 1); range = next_range) { - if (rseq_mempool_range_destroy(pool, range)) + if (rseq_mempool_range_destroy(pool, range, mapping_accessible)) goto end; /* Update list head to keep list coherent in case of partial failure. */ pool->range_list = next_range; } pthread_mutex_destroy(&pool->lock); free(pool->name); - memset(pool, 0, sizeof(*pool)); + free(pool); end: return ret; } @@ -617,10 +942,18 @@ struct rseq_mempool *rseq_mempool_create(const char *pool_name, if (_attr) memcpy(&attr, _attr, sizeof(attr)); - if (!attr.mmap_set) { - attr.mmap_func = default_mmap_func; - attr.munmap_func = default_munmap_func; - attr.mmap_priv = NULL; + + /* + * Validate that the pool populate policy requested is known. + */ + switch (attr.populate_policy) { + case RSEQ_MEMPOOL_POPULATE_COW_INIT: + break; + case RSEQ_MEMPOOL_POPULATE_COW_ZERO: + break; + default: + errno = EINVAL; + return NULL; } switch (attr.type) { @@ -639,12 +972,22 @@ struct rseq_mempool *rseq_mempool_create(const char *pool_name, } break; case MEMPOOL_TYPE_GLOBAL: + /* Override populate policy for global type. */ + if (attr.populate_policy == RSEQ_MEMPOOL_POPULATE_COW_INIT) + attr.populate_policy = RSEQ_MEMPOOL_POPULATE_COW_ZERO; /* Use a 1-cpu pool for global mempool type. 
*/ attr.max_nr_cpus = 1; break; } if (!attr.stride) attr.stride = RSEQ_MEMPOOL_STRIDE; /* Use default */ + if (attr.robust_set && !attr.poison_set) { + attr.poison_set = true; + if (attr.populate_policy == RSEQ_MEMPOOL_POPULATE_COW_INIT) + attr.poison = DEFAULT_COW_INIT_POISON_VALUE; + else + attr.poison = DEFAULT_COW_ZERO_POISON_VALUE; + } if (item_len > attr.stride || attr.stride < (size_t) rseq_get_page_len() || !is_pow2(attr.stride)) { errno = EINVAL; @@ -702,26 +1045,32 @@ void set_alloc_slot(struct rseq_mempool *pool, struct rseq_mempool_range *range, } static -void __rseq_percpu *__rseq_percpu_malloc(struct rseq_mempool *pool, bool zeroed) +void __rseq_percpu *__rseq_percpu_malloc(struct rseq_mempool *pool, + bool zeroed, void *init_ptr, size_t init_len) { struct rseq_mempool_range *range; struct free_list_node *node; uintptr_t item_offset; void __rseq_percpu *addr; + if (init_len > pool->item_len) { + errno = EINVAL; + return NULL; + } pthread_mutex_lock(&pool->lock); /* Get first entry from free list. */ node = pool->free_list_head; if (node != NULL) { - uintptr_t ptr = (uintptr_t) node; - void *range_base = (void *) (ptr & (~(pool->attr.stride - 1))); + void *range_base, *ptr; + ptr = __rseq_free_list_to_percpu_ptr(pool, node); + range_base = (void *) ((uintptr_t) ptr & (~(pool->attr.stride - 1))); range = (struct rseq_mempool_range *) (range_base - RANGE_HEADER_OFFSET); /* Remove node from free list (update head). */ pool->free_list_head = node->next; - item_offset = (uintptr_t) ((void *) node - range_base); + item_offset = (uintptr_t) (ptr - range_base); rseq_percpu_check_poison_item(pool, range, item_offset); - addr = (void __rseq_percpu *) node; + addr = __rseq_free_list_to_percpu_ptr(pool, node); goto end; } /* @@ -749,19 +1098,31 @@ end: if (addr) set_alloc_slot(pool, range, item_offset); pthread_mutex_unlock(&pool->lock); - if (zeroed && addr) - rseq_percpu_zero_item(pool, range, item_offset); + if (addr) { + if (zeroed) + rseq_percpu_zero_item(pool, range, item_offset); + else if (init_ptr) { + rseq_percpu_init_item(pool, range, item_offset, + init_ptr, init_len); + } + } return addr; } void __rseq_percpu *rseq_mempool_percpu_malloc(struct rseq_mempool *pool) { - return __rseq_percpu_malloc(pool, false); + return __rseq_percpu_malloc(pool, false, NULL, 0); } void __rseq_percpu *rseq_mempool_percpu_zmalloc(struct rseq_mempool *pool) { - return __rseq_percpu_malloc(pool, true); + return __rseq_percpu_malloc(pool, true, NULL, 0); +} + +void __rseq_percpu *rseq_mempool_percpu_malloc_init(struct rseq_mempool *pool, + void *init_ptr, size_t len) +{ + return __rseq_percpu_malloc(pool, false, init_ptr, len); } /* Always inline for __builtin_return_address(0). */ @@ -804,11 +1165,11 @@ void librseq_mempool_percpu_free(void __rseq_percpu *_ptr, size_t stride) head = pool->free_list_head; if (pool->attr.poison_set) rseq_percpu_poison_item(pool, range, item_offset); - /* Free-list is in CPU 0 range. */ - item = (struct free_list_node *) ptr; + item = __rseq_percpu_to_free_list_ptr(pool, _ptr); /* * Setting the next pointer will overwrite the first uintptr_t - * poison for CPU 0. + * poison for either CPU 0 (COW_ZERO, non-robust), or init data + * (COW_INIT, non-robust). 
*/ item->next = head; pool->free_list_head = item; @@ -864,7 +1225,8 @@ end: } static -void __rseq_percpu *__rseq_mempool_set_malloc(struct rseq_mempool_set *pool_set, size_t len, bool zeroed) +void __rseq_percpu *__rseq_mempool_set_malloc(struct rseq_mempool_set *pool_set, + void *init_ptr, size_t len, bool zeroed) { int order, min_order = POOL_SET_MIN_ENTRY; struct rseq_mempool *pool; @@ -888,7 +1250,7 @@ again: found: pthread_mutex_unlock(&pool_set->lock); if (pool) { - addr = __rseq_percpu_malloc(pool, zeroed); + addr = __rseq_percpu_malloc(pool, zeroed, init_ptr, len); if (addr == NULL && errno == ENOMEM) { /* * If the allocation failed, try again with a @@ -907,12 +1269,18 @@ found: void __rseq_percpu *rseq_mempool_set_percpu_malloc(struct rseq_mempool_set *pool_set, size_t len) { - return __rseq_mempool_set_malloc(pool_set, len, false); + return __rseq_mempool_set_malloc(pool_set, NULL, len, false); } void __rseq_percpu *rseq_mempool_set_percpu_zmalloc(struct rseq_mempool_set *pool_set, size_t len) { - return __rseq_mempool_set_malloc(pool_set, len, true); + return __rseq_mempool_set_malloc(pool_set, NULL, len, true); +} + +void __rseq_percpu *rseq_mempool_set_percpu_malloc_init(struct rseq_mempool_set *pool_set, + void *init_ptr, size_t len) +{ + return __rseq_mempool_set_malloc(pool_set, init_ptr, len, true); } struct rseq_mempool_attr *rseq_mempool_attr_create(void) @@ -925,22 +1293,6 @@ void rseq_mempool_attr_destroy(struct rseq_mempool_attr *attr) free(attr); } -int rseq_mempool_attr_set_mmap(struct rseq_mempool_attr *attr, - void *(*mmap_func)(void *priv, size_t len), - int (*munmap_func)(void *priv, void *ptr, size_t len), - void *mmap_priv) -{ - if (!attr) { - errno = EINVAL; - return -1; - } - attr->mmap_set = true; - attr->mmap_func = mmap_func; - attr->munmap_func = munmap_func; - attr->mmap_priv = mmap_priv; - return 0; -} - int rseq_mempool_attr_set_init(struct rseq_mempool_attr *attr, int (*init_func)(void *priv, void *addr, size_t len, int cpu), void *init_priv) @@ -952,6 +1304,7 @@ int rseq_mempool_attr_set_init(struct rseq_mempool_attr *attr, attr->init_set = true; attr->init_func = init_func; attr->init_priv = init_priv; + attr->populate_policy = RSEQ_MEMPOOL_POPULATE_COW_INIT; return 0; } @@ -1014,6 +1367,17 @@ int rseq_mempool_attr_set_poison(struct rseq_mempool_attr *attr, return 0; } +int rseq_mempool_attr_set_populate_policy(struct rseq_mempool_attr *attr, + enum rseq_mempool_populate_policy policy) +{ + if (!attr) { + errno = EINVAL; + return -1; + } + attr->populate_policy = policy; + return 0; +} + int rseq_mempool_get_max_nr_cpus(struct rseq_mempool *mempool) { if (!mempool || mempool->attr.type != MEMPOOL_TYPE_PERCPU) {
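
Context for reviewers: the COW_INIT range setup introduced above (a shared memfd mapping for the init values, plus per-CPU MAP_PRIVATE|MAP_FIXED aliases of the same file in rseq_mempool_range_create()) relies on Linux keeping not-yet-written private file pages backed by the shared page cache page, so stores through the init mapping stay visible through a CPU's alias until that CPU's page is copied on write (POSIX leaves this unspecified). Below is a minimal standalone sketch of just that mechanism — not librseq code, arbitrary names and sizes, assuming Linux with the memfd_create() wrapper (glibc >= 2.27):

	#define _GNU_SOURCE
	#include <stdio.h>
	#include <string.h>
	#include <sys/mman.h>
	#include <unistd.h>

	int main(void)
	{
		size_t page_len = (size_t) sysconf(_SC_PAGE_SIZE);
		char *init, *percpu;
		int fd;

		/* Hypothetical name; the pool code uses "<poolname>:rseq-mempool". */
		fd = memfd_create("cow-init-demo", MFD_CLOEXEC);
		if (fd < 0 || ftruncate(fd, (off_t) page_len)) {
			perror("memfd setup");
			return 1;
		}

		/* Shared mapping: stands in for the range "init" area. */
		init = mmap(NULL, page_len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
		/* Private mapping of the same pages: stands in for one CPU's area. */
		percpu = mmap(NULL, page_len, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);
		if (init == MAP_FAILED || percpu == MAP_FAILED) {
			perror("mmap");
			return 1;
		}

		/*
		 * Stores through the shared init mapping are visible through the
		 * private alias as long as its page has not been copied on write.
		 */
		memset(init, 0x2a, 2);
		printf("before COW: percpu[0]=0x%x percpu[1]=0x%x\n", percpu[0], percpu[1]);

		/*
		 * A store through the private alias triggers COW of that page;
		 * later init updates no longer propagate to it.
		 */
		percpu[0] = 0x7f;
		memset(init, 0x55, 2);
		printf("after COW:  percpu[0]=0x%x percpu[1]=0x%x\n", percpu[0], percpu[1]);

		close(fd);
		return 0;
	}

This is also why the patch marks the range mappings MADV_DONTFORK and keeps only a WIPEONFORK marker page: in a child process the COW_INIT mappings are gone, and pool destroy must detect that before touching them.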
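
For reference, a minimal, hypothetical usage sketch of the API surface this change adds (populate-policy selection and rseq_mempool_percpu_malloc_init()). It is an illustration, not part of the patch: it assumes the public declarations in <rseq/mempool.h> from this tree, including rseq_mempool_attr_set_percpu() and rseq_mempool_attr_set_robust() which are not shown in this diff, and that passing 0 for stride and max_nr_cpus selects the default stride and auto-detected CPU count.

	#include <stdio.h>
	#include <stdlib.h>
	#include <rseq/mempool.h>

	struct counter {
		long value;
	};

	int main(void)
	{
		struct counter init_value = { .value = 42 };
		struct counter __rseq_percpu *c;
		struct rseq_mempool_attr *attr;
		struct rseq_mempool *pool;
		int cpu;

		attr = rseq_mempool_attr_create();
		if (!attr
				/* 0, 0: default stride, auto-detected nr cpus (assumption). */
				|| rseq_mempool_attr_set_percpu(attr, 0, 0)
				/* Robust pool: dedicated free list, poison/double-free checks. */
				|| rseq_mempool_attr_set_robust(attr)
				/* Per-CPU areas are private COW aliases of the init values. */
				|| rseq_mempool_attr_set_populate_policy(attr,
					RSEQ_MEMPOOL_POPULATE_COW_INIT))
			abort();
		pool = rseq_mempool_create("demo-pool", sizeof(struct counter), attr);
		if (!pool)
			abort();
		rseq_mempool_attr_destroy(attr);

		/*
		 * Allocate one item per CPU, initialized from init_value. For a
		 * COW_INIT pool the copy lands in the init values area and
		 * propagates to per-CPU pages which have not been COWed yet.
		 */
		c = (struct counter __rseq_percpu *)
			rseq_mempool_percpu_malloc_init(pool, &init_value, sizeof(init_value));
		if (!c)
			abort();

		for (cpu = 0; cpu < rseq_mempool_get_max_nr_cpus(pool); cpu++)
			printf("cpu %d: %ld\n", cpu, rseq_percpu_ptr(c, cpu)->value);

		rseq_mempool_percpu_free(c);
		if (rseq_mempool_destroy(pool))
			perror("rseq_mempool_destroy");
		return 0;
	}

Note how the patch also picks the default poison for robust pools per policy: 0x5555... for COW_INIT, and 0 for COW_ZERO so that poisoning freed items does not force COW of pages still backed by the zero page.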