/*
* Memory layout of a mempool range:
* - Header page (contains struct rseq_mempool_range at the very end),
- * - Base of the per-cpu data, starting with CPU 0,
+ * - Base of the per-cpu data, starting with CPU 0.
+ * Aliases with the free list for non-robust populate-all pools.
* - CPU 1,
* ...
* - CPU max_nr_cpus - 1
* - init values (not allocated for RSEQ_MEMPOOL_POPULATE_ALL).
+ * Aliases with the free list for non-robust populate-none pools.
+ * - free list (robust pools only).
+ *
+ * The free list aliases the CPU 0 memory area for non-robust
+ * populate-all pools. It aliases the init values for non-robust
+ * populate-none pools. For robust pools it is allocated separately,
+ * immediately after the init values (or after the last per-cpu area
+ * when no init values are allocated).
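+ *
+ * For illustration, with stride S and N = max_nr_cpus, the areas
+ * start at the following offsets from the per-cpu base:
+ *   cpu c data:   c * S
+ *   init values:  N * S (when allocated)
+ *   free list:    N * S for robust populate-all pools,
+ *                 (N + 1) * S for robust populate-none pools,
+ *                 0 for non-robust populate-all pools (aliases cpu 0),
+ *                 N * S for non-robust populate-none pools (aliases
+ *                 the init values).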
*/
void *header;
void *base;
{
void __rseq_percpu *p = (void __rseq_percpu *) node;
- if (pool->attr.populate_policy != RSEQ_MEMPOOL_POPULATE_ALL)
+ if (pool->attr.robust_set) {
+ /* Skip cpus. */
p -= pool->attr.max_nr_cpus * pool->attr.stride;
+ /* Skip init values. */
+ if (pool->attr.populate_policy != RSEQ_MEMPOOL_POPULATE_ALL)
+ p -= pool->attr.stride;
+
+ } else {
+ /* Populate-none free list is in the init values area. */
+ if (pool->attr.populate_policy != RSEQ_MEMPOOL_POPULATE_ALL)
+ p -= pool->attr.max_nr_cpus * pool->attr.stride;
+ }
return p;
}
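+/*
+ * Convert a per-cpu item pointer to the address of its free-list node,
+ * following the layout described above: for robust pools, skip over the
+ * per-cpu areas (and the init values, when allocated); for non-robust
+ * pools the free list aliases either cpu 0 (populate all) or the init
+ * values (populate none).
+ */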
struct free_list_node *__rseq_percpu_to_free_list_ptr(const struct rseq_mempool *pool,
void __rseq_percpu *p)
{
- if (pool->attr.populate_policy != RSEQ_MEMPOOL_POPULATE_ALL)
+ if (pool->attr.robust_set) {
+ /* Skip cpus. */
p += pool->attr.max_nr_cpus * pool->attr.stride;
+ /* Skip init values. */
+ if (pool->attr.populate_policy != RSEQ_MEMPOOL_POPULATE_ALL)
+ p += pool->attr.stride;
+
+ } else {
+ /* Populate-none free list is in the init values area. */
+ if (pool->attr.populate_policy != RSEQ_MEMPOOL_POPULATE_ALL)
+ p += pool->attr.max_nr_cpus * pool->attr.stride;
+ }
return (struct free_list_node *) p;
}
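+/*
+ * The init values of a range are backed by a memfd that is addressed
+ * by the init pointer value itself: convert a pointer to the off_t
+ * used when reserving space in, mapping, and punching holes in the
+ * memfd.
+ */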
+static
+off_t ptr_to_off_t(void *p)
+{
+ return (off_t) (uintptr_t) p;
+}
+
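+/*
+ * Return 0 when the first n bytes of s all contain the byte value c,
+ * non-zero otherwise (like memcmp() against a buffer filled with c).
+ */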
static
int memcmpbyte(const char *s, int c, size_t n)
{
char *p = __rseq_pool_range_percpu_ptr(range, i,
item_offset, pool->attr.stride);
- /* Update propagated */
- if (init_p && !memcmpbyte(p, 0, pool->item_len))
+ /*
+ * If the item is already zeroed, either because the
+ * init range update has propagated or because the
+ * content is already zeroed (e.g. zero page), don't
+ * write to the page. This eliminates useless COW over
+ * the zero page just for overwriting it with zeroes.
+ *
+ * This means zmalloc() in a populate-all policy pool
+ * does not trigger COW for CPUs which are not actively
+ * writing to the pool. This is however not the case for
+ * malloc_init() in populate-all pools if it populates
+ * non-zero content.
+ */
+ if (!memcmpbyte(p, 0, pool->item_len))
continue;
memset(p, 0, pool->item_len);
}
char *p = __rseq_pool_range_percpu_ptr(range, i,
item_offset, pool->attr.stride);
- /* Update propagated */
- if (init_p && !memcmp(init_p, p, init_len))
+ /*
+ * If the update propagated through a shared mapping,
+ * or the item already has the correct content, skip
+ * writing it into the cpu item to eliminate useless
+ * COW of the page.
+ */
+ if (!memcmp(init_ptr, p, init_len))
continue;
memcpy(p, init_ptr, init_len);
}
*((uintptr_t *) (p + offset)) = poison;
}
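+/*
+ * Compare an item word by word against the poison value. Return 0 when
+ * every uintptr_t-sized word matches; otherwise return a non-zero
+ * difference and, when unexpected_value is non-NULL, store the first
+ * mismatching word in it.
+ */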
+static
+intptr_t rseq_cmp_poison_item(void *p, size_t item_len, uintptr_t poison,
+ intptr_t *unexpected_value)
+{
+ size_t offset;
+ intptr_t res = 0;
+
+ for (offset = 0; offset < item_len; offset += sizeof(uintptr_t)) {
+ intptr_t v = *((intptr_t *) (p + offset));
+
+ if ((res = v - (intptr_t) poison) != 0) {
+ if (unexpected_value)
+ *unexpected_value = v;
+ break;
+ }
+ }
+ return res;
+}
+
static
void rseq_percpu_poison_item(struct rseq_mempool *pool,
struct rseq_mempool_range *range, uintptr_t item_offset)
char *p = __rseq_pool_range_percpu_ptr(range, i,
item_offset, pool->attr.stride);
- /* Update propagated */
- if (init_p && !memcmp(init_p, p, pool->item_len))
+ /*
+ * If the update propagated through a shared mapping,
+ * or the item already has the correct content, skip
+ * writing it into the cpu item to eliminate useless
+ * COW of the page.
+ *
+ * It is recommended to use zero as the poison value for
+ * populate-all pools to eliminate COW due to writing
+ * poison to unused CPU memory.
+ */
+ if (rseq_cmp_poison_item(p, pool->item_len, poison, NULL) == 0)
continue;
rseq_poison_item(p, pool->item_len, poison);
}
/* Always inline for __builtin_return_address(0). */
static inline __attribute__((always_inline))
void rseq_check_poison_item(const struct rseq_mempool *pool, uintptr_t item_offset,
- void *p, size_t item_len, uintptr_t poison, bool skip_freelist_ptr)
+ void *p, size_t item_len, uintptr_t poison)
{
- size_t offset;
+ intptr_t unexpected_value;
- for (offset = 0; offset < item_len; offset += sizeof(uintptr_t)) {
- uintptr_t v;
+ if (rseq_cmp_poison_item(p, item_len, poison, &unexpected_value) == 0)
+ return;
- /* Skip poison check for free-list pointer. */
- if (skip_freelist_ptr && offset == 0)
- continue;
- v = *((uintptr_t *) (p + offset));
- if (v != poison) {
- fprintf(stderr, "%s: Poison corruption detected (0x%lx) for pool: \"%s\" (%p), item offset: %zu, caller: %p.\n",
- __func__, (unsigned long) v, get_pool_name(pool), pool, item_offset, (void *) __builtin_return_address(0));
- abort();
- }
- }
+ fprintf(stderr, "%s: Poison corruption detected (0x%lx) for pool: \"%s\" (%p), item offset: %zu, caller: %p.\n",
+ __func__, (unsigned long) unexpected_value, get_pool_name(pool), pool, item_offset, (void *) __builtin_return_address(0));
+ abort();
}
/* Always inline for __builtin_return_address(0). */
return;
init_p = __rseq_pool_range_init_ptr(range, item_offset);
if (init_p)
- rseq_check_poison_item(pool, item_offset, init_p, pool->item_len, poison, true);
+ rseq_check_poison_item(pool, item_offset, init_p, pool->item_len, poison);
for (i = 0; i < pool->attr.max_nr_cpus; i++) {
char *p = __rseq_pool_range_percpu_ptr(range, i,
item_offset, pool->attr.stride);
- /*
- * When the free list is embedded in the init values
- * memory (populate none), it is visible from the init
- * values memory mapping as well as per-cpu private
- * mappings before they COW.
- *
- * When the free list is embedded in CPU 0 mapping
- * (populate all), only this CPU must skip the free list
- * nodes when checking poison.
- */
- rseq_check_poison_item(pool, item_offset, p, pool->item_len, poison,
- init_p == NULL ? (i == 0) : true);
+ rseq_check_poison_item(pool, item_offset, p, pool->item_len, poison);
}
}
*/
if (range->init) {
ret = fallocate(memfd.fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
- (off_t) range->init, pool->attr.stride);
+ ptr_to_off_t(range->init), pool->attr.stride);
if (ret)
return ret;
range->init = NULL;
size_t reserve_len;
pthread_mutex_lock(&memfd.lock);
- reserve_len = (size_t) init + init_len;
+ reserve_len = (size_t) ptr_to_off_t(init) + init_len;
if (reserve_len > memfd.reserved_size) {
if (ftruncate(memfd.fd, (off_t) reserve_len)) {
ret = -1;
range_len = pool->attr.stride * pool->attr.max_nr_cpus;
if (pool->attr.populate_policy != RSEQ_MEMPOOL_POPULATE_ALL)
range_len += pool->attr.stride; /* init values */
+ if (pool->attr.robust_set)
+ range_len += pool->attr.stride; /* free list */
base = aligned_mmap_anonymous(pool, page_size,
range_len,
pool->attr.stride,
goto error_alloc;
if (mmap(range->init, pool->attr.stride, PROT_READ | PROT_WRITE,
MAP_SHARED | MAP_FIXED, memfd.fd,
- (off_t) range->init) != (void *) range->init) {
+ ptr_to_off_t(range->init)) != (void *) range->init) {
goto error_alloc;
}
assert(pool->attr.type == MEMPOOL_TYPE_PERCPU);
size_t len = pool->attr.stride;
if (mmap(p, len, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_FIXED,
- memfd.fd, (off_t) range->init) != (void *) p) {
+ memfd.fd, ptr_to_off_t(range->init)) != (void *) p) {
goto error_alloc;
}
}