| 1 | // SPDX-License-Identifier: MIT |
| 2 | // SPDX-FileCopyrightText: 2024 Mathieu Desnoyers <mathieu.desnoyers@efficios.com> |
| 3 | |
| 4 | #include <rseq/mempool.h> |
| 5 | #include <sys/mman.h> |
| 6 | #include <assert.h> |
| 7 | #include <string.h> |
| 8 | #include <pthread.h> |
| 9 | #include <unistd.h> |
| 10 | #include <stdlib.h> |
| 11 | #include <rseq/compiler.h> |
| 12 | #include <errno.h> |
| 13 | #include <stdint.h> |
| 14 | #include <stdbool.h> |
| 15 | #include <stdio.h> |
| 16 | |
| 17 | #ifdef HAVE_LIBNUMA |
| 18 | # include <numa.h> |
| 19 | # include <numaif.h> |
| 20 | #endif |
| 21 | |
| 22 | #include "rseq-utils.h" |
| 23 | #include "smp.h" |
| 24 | |
| 25 | /* |
| 26 | * rseq-mempool.c: rseq CPU-Local Storage (CLS) memory allocator. |
| 27 | * |
 * The rseq per-CPU memory allocator allows the application to request
 * memory pools of CPU-Local memory, each containing objects of a
 * given size (rounded up to the next power of 2), reserving a given
 * virtual address size per CPU, for a given maximum number of CPUs.
 *
 * The per-CPU memory allocator is analogous to TLS (Thread-Local
 * Storage) memory: where TLS provides storage local to each thread,
 * the per-CPU memory allocator provides storage local to each CPU.
| 36 | */ |
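
/*
 * Minimal usage sketch (not compiled here). It assumes that the
 * accessor rseq_percpu_ptr() and the stride-less
 * rseq_mempool_percpu_free() wrapper are provided by <rseq/mempool.h>;
 * only librseq_mempool_percpu_free(), which takes an explicit stride,
 * is defined in this file.
 *
 *	struct counter { long count; };
 *
 *	struct rseq_mempool *pool;
 *	struct counter __rseq_percpu *c;
 *
 *	pool = rseq_mempool_create("counters", sizeof(struct counter), NULL);
 *	if (!pool)
 *		abort();
 *	c = rseq_mempool_percpu_zmalloc(pool);
 *	if (!c)
 *		abort();
 *	rseq_percpu_ptr(c, 0)->count++;
 *	rseq_mempool_percpu_free(c);
 *	if (rseq_mempool_destroy(pool))
 *		abort();
 *
 * rseq_mempool_percpu_zmalloc() reserves one zeroed item per CPU and
 * returns the address of the CPU 0 instance; rseq_percpu_ptr() is
 * assumed to translate that address into the instance of a given CPU.
 */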
| 37 | |
| 38 | #define POOL_SET_NR_ENTRIES RSEQ_BITS_PER_LONG |
| 39 | |
| 40 | /* |
| 41 | * Smallest allocation should hold enough space for a free list pointer. |
| 42 | */ |
| 43 | #if RSEQ_BITS_PER_LONG == 64 |
| 44 | # define POOL_SET_MIN_ENTRY 3 /* Smallest item_len=8 */ |
| 45 | #else |
| 46 | # define POOL_SET_MIN_ENTRY 2 /* Smallest item_len=4 */ |
| 47 | #endif |
| 48 | |
| 49 | /* |
| 50 | * Skip pool index 0 to ensure allocated entries at index 0 do not match |
| 51 | * a NULL pointer. |
| 52 | */ |
| 53 | #define FIRST_POOL 1 |
| 54 | |
| 55 | #define BIT_PER_ULONG (8 * sizeof(unsigned long)) |
| 56 | |
| 57 | #define MOVE_PAGES_BATCH_SIZE 4096 |
| 58 | |
| 59 | #define RANGE_HEADER_OFFSET sizeof(struct rseq_mempool_range) |
| 60 | |
| 61 | struct free_list_node; |
| 62 | |
| 63 | struct free_list_node { |
| 64 | struct free_list_node *next; |
| 65 | }; |
| 66 | |
| 67 | enum mempool_type { |
| 68 | MEMPOOL_TYPE_PERCPU = 0, /* Default */ |
| 69 | MEMPOOL_TYPE_GLOBAL = 1, |
| 70 | }; |
| 71 | |
| 72 | struct rseq_mempool_attr { |
| 73 | bool mmap_set; |
| 74 | void *(*mmap_func)(void *priv, size_t len); |
| 75 | int (*munmap_func)(void *priv, void *ptr, size_t len); |
| 76 | void *mmap_priv; |
| 77 | |
| 78 | bool robust_set; |
| 79 | |
| 80 | enum mempool_type type; |
| 81 | size_t stride; |
| 82 | int max_nr_cpus; |
| 83 | }; |
| 84 | |
| 85 | struct rseq_mempool_range; |
| 86 | |
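/*
 * A range is allocated as one stride-aligned mapping of
 * stride * max_nr_cpus bytes, preceded by a one-page header. The
 * struct rseq_mempool_range descriptor is stored at the end of that
 * header page, immediately before the aligned base address, so it can
 * be recovered from the base address by subtracting
 * RANGE_HEADER_OFFSET.
 */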
| 87 | struct rseq_mempool_range { |
| 88 | struct rseq_mempool_range *next; |
| 89 | struct rseq_mempool *pool; /* Backward ref. to container pool. */ |
| 90 | void *header; |
| 91 | void *base; |
| 92 | size_t next_unused; |
| 93 | /* Track alloc/free. */ |
| 94 | unsigned long *alloc_bitmap; |
| 95 | }; |
| 96 | |
| 97 | struct rseq_mempool { |
| 98 | /* Linked-list of ranges. */ |
| 99 | struct rseq_mempool_range *ranges; |
| 100 | |
| 101 | size_t item_len; |
| 102 | int item_order; |
| 103 | |
| 104 | /* |
| 105 | * The free list chains freed items on the CPU 0 address range. |
| 106 | * We should rethink this decision if false sharing between |
| 107 | * malloc/free from other CPUs and data accesses from CPU 0 |
| 108 | * becomes an issue. This is a NULL-terminated singly-linked |
| 109 | * list. |
| 110 | */ |
| 111 | struct free_list_node *free_list_head; |
| 112 | |
| 113 | /* This lock protects allocation/free within the pool. */ |
| 114 | pthread_mutex_t lock; |
| 115 | |
| 116 | struct rseq_mempool_attr attr; |
| 117 | char *name; |
| 118 | }; |
| 119 | |
| 120 | /* |
 * Pool set entries are indexed by the binary order of their item
 * length (item_len rounded up to the next power of 2). A pool set can
 * contain NULL pool entries, in which case the next large enough entry
 * will be used for allocation.
| 124 | */ |
| 125 | struct rseq_mempool_set { |
| 126 | /* This lock protects add vs malloc/zmalloc within the pool set. */ |
| 127 | pthread_mutex_t lock; |
| 128 | struct rseq_mempool *entries[POOL_SET_NR_ENTRIES]; |
| 129 | }; |
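
/*
 * Pool set usage sketch (not compiled here): populate a set with pools
 * of increasing item sizes, then let the set pick the smallest pool
 * that fits each allocation. Ownership of each added pool is handed
 * over to the set. The rseq_mempool_percpu_free() wrapper used to
 * release items is assumed to be provided by <rseq/mempool.h>.
 *
 *	struct rseq_mempool_set *set;
 *	void __rseq_percpu *p;
 *	size_t len;
 *
 *	set = rseq_mempool_set_create();
 *	if (!set)
 *		abort();
 *	for (len = 8; len <= 128; len <<= 1) {
 *		struct rseq_mempool *pool;
 *
 *		pool = rseq_mempool_create("set-pool", len, NULL);
 *		if (!pool || rseq_mempool_set_add_pool(set, pool))
 *			abort();
 *	}
 *	p = rseq_mempool_set_percpu_zmalloc(set, 24);
 *	... use p, then release it with rseq_mempool_percpu_free(p) ...
 *	if (rseq_mempool_set_destroy(set))
 *		abort();
 *
 * The 24-byte request is rounded up to a 32-byte order, so it is
 * served by the 32-byte pool added above.
 */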
| 130 | |
| 131 | static |
| 132 | void *__rseq_pool_percpu_ptr(struct rseq_mempool *pool, int cpu, |
| 133 | uintptr_t item_offset, size_t stride) |
| 134 | { |
| 135 | /* TODO: Implement multi-ranges support. */ |
| 136 | return pool->ranges->base + (stride * cpu) + item_offset; |
| 137 | } |
| 138 | |
| 139 | static |
| 140 | void rseq_percpu_zero_item(struct rseq_mempool *pool, uintptr_t item_offset) |
| 141 | { |
| 142 | int i; |
| 143 | |
| 144 | for (i = 0; i < pool->attr.max_nr_cpus; i++) { |
| 145 | char *p = __rseq_pool_percpu_ptr(pool, i, |
| 146 | item_offset, pool->attr.stride); |
| 147 | memset(p, 0, pool->item_len); |
| 148 | } |
| 149 | } |
| 150 | |
| 151 | //TODO: this will need to be reimplemented for ranges, |
| 152 | //which cannot use __rseq_pool_percpu_ptr. |
| 153 | #if 0 //#ifdef HAVE_LIBNUMA |
| 154 | static |
| 155 | int rseq_mempool_range_init_numa(struct rseq_mempool *pool, struct rseq_mempool_range *range, int numa_flags) |
| 156 | { |
| 157 | unsigned long nr_pages, page_len; |
| 158 | long ret; |
| 159 | int cpu; |
| 160 | |
| 161 | if (!numa_flags) |
| 162 | return 0; |
| 163 | page_len = rseq_get_page_len(); |
| 164 | nr_pages = pool->attr.stride >> rseq_get_count_order_ulong(page_len); |
| 165 | for (cpu = 0; cpu < pool->attr.max_nr_cpus; cpu++) { |
| 166 | |
| 167 | int status[MOVE_PAGES_BATCH_SIZE]; |
| 168 | int nodes[MOVE_PAGES_BATCH_SIZE]; |
| 169 | void *pages[MOVE_PAGES_BATCH_SIZE]; |
| 170 | |
| 171 | nodes[0] = numa_node_of_cpu(cpu); |
| 172 | for (size_t k = 1; k < RSEQ_ARRAY_SIZE(nodes); ++k) { |
| 173 | nodes[k] = nodes[0]; |
| 174 | } |
| 175 | |
| 176 | for (unsigned long page = 0; page < nr_pages;) { |
| 177 | |
| 178 | size_t max_k = RSEQ_ARRAY_SIZE(pages); |
| 179 | size_t left = nr_pages - page; |
| 180 | |
| 181 | if (left < max_k) { |
| 182 | max_k = left; |
| 183 | } |
| 184 | |
| 185 | for (size_t k = 0; k < max_k; ++k, ++page) { |
| 186 | pages[k] = __rseq_pool_percpu_ptr(pool, cpu, page * page_len); |
| 187 | status[k] = -EPERM; |
| 188 | } |
| 189 | |
| 190 | ret = move_pages(0, max_k, pages, nodes, status, numa_flags); |
| 191 | |
| 192 | if (ret < 0) |
| 193 | return ret; |
| 194 | |
| 195 | if (ret > 0) { |
				fprintf(stderr, "%ld pages were not migrated\n", ret);
| 197 | for (size_t k = 0; k < max_k; ++k) { |
| 198 | if (status[k] < 0) |
						fprintf(stderr,
							"Error while moving page %p to numa node %d: %d\n",
							pages[k], nodes[k], -status[k]);
| 202 | } |
| 203 | } |
| 204 | } |
| 205 | } |
| 206 | return 0; |
| 207 | } |
| 208 | |
| 209 | int rseq_mempool_init_numa(struct rseq_mempool *pool, int numa_flags) |
| 210 | { |
| 211 | struct rseq_mempool_range *range; |
| 212 | int ret; |
| 213 | |
| 214 | if (!numa_flags) |
| 215 | return 0; |
| 216 | for (range = pool->ranges; range; range = range->next) { |
| 217 | ret = rseq_mempool_range_init_numa(pool, range, numa_flags); |
| 218 | if (ret) |
| 219 | return ret; |
| 220 | } |
| 221 | return 0; |
| 222 | } |
| 223 | #else |
| 224 | int rseq_mempool_init_numa(struct rseq_mempool *pool __attribute__((unused)), |
| 225 | int numa_flags __attribute__((unused))) |
| 226 | { |
| 227 | return 0; |
| 228 | } |
| 229 | #endif |
| 230 | |
| 231 | static |
| 232 | void *default_mmap_func(void *priv __attribute__((unused)), size_t len) |
| 233 | { |
| 234 | void *base; |
| 235 | |
| 236 | base = mmap(NULL, len, PROT_READ | PROT_WRITE, |
| 237 | MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); |
| 238 | if (base == MAP_FAILED) |
| 239 | return NULL; |
| 240 | return base; |
| 241 | } |
| 242 | |
| 243 | static |
| 244 | int default_munmap_func(void *priv __attribute__((unused)), void *ptr, size_t len) |
| 245 | { |
| 246 | return munmap(ptr, len); |
| 247 | } |
| 248 | |
| 249 | static |
| 250 | int create_alloc_bitmap(struct rseq_mempool *pool, struct rseq_mempool_range *range) |
| 251 | { |
| 252 | size_t count; |
| 253 | |
| 254 | count = ((pool->attr.stride >> pool->item_order) + BIT_PER_ULONG - 1) / BIT_PER_ULONG; |
| 255 | |
| 256 | /* |
| 257 | * Not being able to create the validation bitmap is an error |
| 258 | * that needs to be reported. |
| 259 | */ |
| 260 | range->alloc_bitmap = calloc(count, sizeof(unsigned long)); |
| 261 | if (!range->alloc_bitmap) |
| 262 | return -1; |
| 263 | return 0; |
| 264 | } |
| 265 | |
| 266 | static |
| 267 | const char *get_pool_name(const struct rseq_mempool *pool) |
| 268 | { |
| 269 | return pool->name ? : "<anonymous>"; |
| 270 | } |
| 271 | |
| 272 | static |
| 273 | bool addr_in_pool(const struct rseq_mempool *pool, void *addr) |
| 274 | { |
| 275 | struct rseq_mempool_range *range; |
| 276 | |
| 277 | for (range = pool->ranges; range; range = range->next) { |
| 278 | if (addr >= range->base && addr < range->base + range->next_unused) |
| 279 | return true; |
| 280 | } |
| 281 | return false; |
| 282 | } |
| 283 | |
| 284 | /* Always inline for __builtin_return_address(0). */ |
| 285 | static inline __attribute__((always_inline)) |
| 286 | void check_free_list(const struct rseq_mempool *pool) |
| 287 | { |
| 288 | size_t total_item = 0, total_never_allocated = 0, total_freed = 0, |
| 289 | max_list_traversal = 0, traversal_iteration = 0; |
| 290 | struct rseq_mempool_range *range; |
| 291 | |
| 292 | if (!pool->attr.robust_set) |
| 293 | return; |
| 294 | |
| 295 | for (range = pool->ranges; range; range = range->next) { |
| 296 | total_item += pool->attr.stride >> pool->item_order; |
| 297 | total_never_allocated += (pool->attr.stride - range->next_unused) >> pool->item_order; |
| 298 | } |
| 299 | max_list_traversal = total_item - total_never_allocated; |
| 300 | |
| 301 | for (struct free_list_node *node = pool->free_list_head, *prev = NULL; |
| 302 | node; |
| 303 | prev = node, |
| 304 | node = node->next) { |
| 305 | |
| 306 | void *node_addr = node; |
| 307 | |
| 308 | if (traversal_iteration >= max_list_traversal) { |
| 309 | fprintf(stderr, "%s: Corrupted free-list; Possibly infinite loop in pool \"%s\" (%p), caller %p.\n", |
| 310 | __func__, get_pool_name(pool), pool, __builtin_return_address(0)); |
| 311 | abort(); |
| 312 | } |
| 313 | |
| 314 | /* Node is out of range. */ |
| 315 | if (!addr_in_pool(pool, node_addr)) { |
| 316 | if (prev) |
| 317 | fprintf(stderr, "%s: Corrupted free-list node %p -> [out-of-range %p] in pool \"%s\" (%p), caller %p.\n", |
| 318 | __func__, prev, node, get_pool_name(pool), pool, __builtin_return_address(0)); |
| 319 | else |
| 320 | fprintf(stderr, "%s: Corrupted free-list node [out-of-range %p] in pool \"%s\" (%p), caller %p.\n", |
| 321 | __func__, node, get_pool_name(pool), pool, __builtin_return_address(0)); |
| 322 | abort(); |
| 323 | } |
| 324 | |
| 325 | traversal_iteration++; |
| 326 | total_freed++; |
| 327 | } |
| 328 | |
| 329 | if (total_never_allocated + total_freed != total_item) { |
| 330 | fprintf(stderr, "%s: Corrupted free-list in pool \"%s\" (%p); total-item: %zu total-never-used: %zu total-freed: %zu, caller %p.\n", |
| 331 | __func__, get_pool_name(pool), pool, total_item, total_never_allocated, total_freed, __builtin_return_address(0)); |
| 332 | abort(); |
| 333 | } |
| 334 | } |
| 335 | |
| 336 | /* Always inline for __builtin_return_address(0). */ |
| 337 | static inline __attribute__((always_inline)) |
| 338 | void destroy_alloc_bitmap(struct rseq_mempool *pool, struct rseq_mempool_range *range) |
| 339 | { |
| 340 | unsigned long *bitmap = range->alloc_bitmap; |
| 341 | size_t count, total_leaks = 0; |
| 342 | |
| 343 | if (!bitmap) |
| 344 | return; |
| 345 | |
| 346 | count = ((pool->attr.stride >> pool->item_order) + BIT_PER_ULONG - 1) / BIT_PER_ULONG; |
| 347 | |
| 348 | /* Assert that all items in the pool were freed. */ |
| 349 | for (size_t k = 0; k < count; ++k) |
| 350 | total_leaks += rseq_hweight_ulong(bitmap[k]); |
| 351 | if (total_leaks) { |
| 352 | fprintf(stderr, "%s: Pool \"%s\" (%p) has %zu leaked items on destroy, caller: %p.\n", |
| 353 | __func__, get_pool_name(pool), pool, total_leaks, (void *) __builtin_return_address(0)); |
| 354 | abort(); |
| 355 | } |
| 356 | |
| 357 | free(bitmap); |
| 358 | } |
| 359 | |
| 360 | /* Always inline for __builtin_return_address(0). */ |
| 361 | static inline __attribute__((always_inline)) |
| 362 | int rseq_mempool_range_destroy(struct rseq_mempool *pool, |
| 363 | struct rseq_mempool_range *range) |
| 364 | { |
| 365 | destroy_alloc_bitmap(pool, range); |
	/* The range header page is located one page before the aligned mapping. */
| 367 | return pool->attr.munmap_func(pool->attr.mmap_priv, range->header, |
| 368 | (pool->attr.stride * pool->attr.max_nr_cpus) + rseq_get_page_len()); |
| 369 | } |
| 370 | |
/*
 * Allocate a memory mapping aligned on @alignment, with an optional
 * @pre_header before the mapping. The mapping is over-allocated by up
 * to (@alignment - page_size) bytes; the unaligned head and the unused
 * tail are then unmapped, so that exactly @pre_header_len + @len bytes
 * remain mapped, with the post-pre_header portion aligned on
 * @alignment.
 */
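/*
 * Worked example (hypothetical numbers): with 4 kB pages,
 * @pre_header_len = 4 kB, @len = 64 kB and @alignment = 64 kB,
 * minimum_page_count is 17 pages and page_count is 32 pages. After
 * trimming the head so that (ptr + pre_header_len) is 64 kB-aligned
 * and trimming the tail back down to minimum_page_count, exactly 68 kB
 * remain mapped.
 */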
| 375 | static |
| 376 | void *aligned_mmap_anonymous(struct rseq_mempool *pool, |
| 377 | size_t page_size, size_t len, size_t alignment, |
| 378 | void **pre_header, size_t pre_header_len) |
| 379 | { |
| 380 | size_t minimum_page_count, page_count, extra, total_allocate = 0; |
| 381 | int page_order; |
| 382 | void *ptr; |
| 383 | |
| 384 | if (len < page_size || alignment < page_size || |
| 385 | !is_pow2(len) || !is_pow2(alignment)) { |
| 386 | errno = EINVAL; |
| 387 | return NULL; |
| 388 | } |
| 389 | page_order = rseq_get_count_order_ulong(page_size); |
| 390 | if (page_order < 0) { |
| 391 | errno = EINVAL; |
| 392 | return NULL; |
| 393 | } |
| 394 | if (pre_header_len && (pre_header_len & (page_size - 1))) { |
| 395 | errno = EINVAL; |
| 396 | return NULL; |
| 397 | } |
| 398 | |
| 399 | minimum_page_count = (pre_header_len + len) >> page_order; |
| 400 | page_count = (pre_header_len + len + alignment - page_size) >> page_order; |
| 401 | |
| 402 | assert(page_count >= minimum_page_count); |
| 403 | |
| 404 | ptr = pool->attr.mmap_func(pool->attr.mmap_priv, page_count << page_order); |
| 405 | if (!ptr) |
| 406 | goto alloc_error; |
| 407 | |
| 408 | total_allocate = page_count << page_order; |
| 409 | |
| 410 | if (!(((uintptr_t) ptr + pre_header_len) & (alignment - 1))) { |
| 411 | /* Pointer is already aligned. ptr points to pre_header. */ |
| 412 | goto out; |
| 413 | } |
| 414 | |
| 415 | /* Unmap extra before. */ |
| 416 | extra = offset_align((uintptr_t) ptr + pre_header_len, alignment); |
| 417 | assert(!(extra & (page_size - 1))); |
| 418 | if (pool->attr.munmap_func(pool->attr.mmap_priv, ptr, extra)) { |
| 419 | perror("munmap"); |
| 420 | abort(); |
| 421 | } |
| 422 | total_allocate -= extra; |
| 423 | ptr += extra; /* ptr points to pre_header */ |
| 424 | page_count -= extra >> page_order; |
| 425 | out: |
| 426 | assert(page_count >= minimum_page_count); |
| 427 | |
| 428 | if (page_count > minimum_page_count) { |
| 429 | void *extra_ptr; |
| 430 | |
| 431 | /* Unmap extra after. */ |
| 432 | extra_ptr = ptr + (minimum_page_count << page_order); |
| 433 | extra = (page_count - minimum_page_count) << page_order; |
| 434 | if (pool->attr.munmap_func(pool->attr.mmap_priv, extra_ptr, extra)) { |
| 435 | perror("munmap"); |
| 436 | abort(); |
| 437 | } |
| 438 | total_allocate -= extra; |
| 439 | } |
| 440 | |
| 441 | assert(!(((uintptr_t)ptr + pre_header_len) & (alignment - 1))); |
| 442 | assert(total_allocate == len + pre_header_len); |
| 443 | |
| 444 | alloc_error: |
| 445 | if (ptr) { |
| 446 | if (pre_header) |
| 447 | *pre_header = ptr; |
| 448 | ptr += pre_header_len; |
| 449 | } |
| 450 | return ptr; |
| 451 | } |
| 452 | |
| 453 | static |
| 454 | struct rseq_mempool_range *rseq_mempool_range_create(struct rseq_mempool *pool) |
| 455 | { |
| 456 | struct rseq_mempool_range *range; |
| 457 | unsigned long page_size; |
| 458 | void *header; |
| 459 | void *base; |
| 460 | |
| 461 | page_size = rseq_get_page_len(); |
| 462 | |
| 463 | base = aligned_mmap_anonymous(pool, page_size, |
| 464 | pool->attr.stride * pool->attr.max_nr_cpus, |
| 465 | pool->attr.stride, |
| 466 | &header, page_size); |
| 467 | if (!base) |
| 468 | return NULL; |
| 469 | range = (struct rseq_mempool_range *) (base - RANGE_HEADER_OFFSET); |
| 470 | range->pool = pool; |
| 471 | range->base = base; |
| 472 | range->header = header; |
| 473 | if (pool->attr.robust_set) { |
| 474 | if (create_alloc_bitmap(pool, range)) |
| 475 | goto error_alloc; |
| 476 | } |
| 477 | return range; |
| 478 | |
| 479 | error_alloc: |
| 480 | (void) rseq_mempool_range_destroy(pool, range); |
| 481 | return NULL; |
| 482 | } |
| 483 | |
| 484 | int rseq_mempool_destroy(struct rseq_mempool *pool) |
| 485 | { |
| 486 | struct rseq_mempool_range *range, *next_range; |
| 487 | int ret = 0; |
| 488 | |
| 489 | if (!pool) |
| 490 | return 0; |
| 491 | check_free_list(pool); |
| 492 | /* Iteration safe against removal. */ |
| 493 | for (range = pool->ranges; range && (next_range = range->next, 1); range = next_range) { |
		ret = rseq_mempool_range_destroy(pool, range);
		if (ret)
			goto end;
| 496 | /* Update list head to keep list coherent in case of partial failure. */ |
| 497 | pool->ranges = next_range; |
| 498 | } |
| 499 | pthread_mutex_destroy(&pool->lock); |
| 500 | free(pool->name); |
| 501 | memset(pool, 0, sizeof(*pool)); |
| 502 | end: |
| 503 | return ret; |
| 504 | } |
| 505 | |
| 506 | struct rseq_mempool *rseq_mempool_create(const char *pool_name, |
| 507 | size_t item_len, const struct rseq_mempool_attr *_attr) |
| 508 | { |
| 509 | struct rseq_mempool *pool; |
| 510 | struct rseq_mempool_attr attr = {}; |
| 511 | int order; |
| 512 | |
| 513 | /* Make sure each item is large enough to contain free list pointers. */ |
| 514 | if (item_len < sizeof(void *)) |
| 515 | item_len = sizeof(void *); |
| 516 | |
| 517 | /* Align item_len on next power of two. */ |
| 518 | order = rseq_get_count_order_ulong(item_len); |
| 519 | if (order < 0) { |
| 520 | errno = EINVAL; |
| 521 | return NULL; |
| 522 | } |
| 523 | item_len = 1UL << order; |
| 524 | |
| 525 | if (_attr) |
| 526 | memcpy(&attr, _attr, sizeof(attr)); |
| 527 | if (!attr.mmap_set) { |
| 528 | attr.mmap_func = default_mmap_func; |
| 529 | attr.munmap_func = default_munmap_func; |
| 530 | attr.mmap_priv = NULL; |
| 531 | } |
| 532 | |
| 533 | switch (attr.type) { |
| 534 | case MEMPOOL_TYPE_PERCPU: |
| 535 | if (attr.max_nr_cpus < 0) { |
| 536 | errno = EINVAL; |
| 537 | return NULL; |
| 538 | } |
| 539 | if (attr.max_nr_cpus == 0) { |
| 540 | /* Auto-detect */ |
| 541 | attr.max_nr_cpus = get_possible_cpus_array_len(); |
| 542 | if (attr.max_nr_cpus == 0) { |
| 543 | errno = EINVAL; |
| 544 | return NULL; |
| 545 | } |
| 546 | } |
| 547 | break; |
| 548 | case MEMPOOL_TYPE_GLOBAL: |
| 549 | break; |
| 550 | } |
| 551 | if (!attr.stride) |
| 552 | attr.stride = RSEQ_MEMPOOL_STRIDE; /* Use default */ |
| 553 | if (item_len > attr.stride || attr.stride < (size_t) rseq_get_page_len() || |
| 554 | !is_pow2(attr.stride)) { |
| 555 | errno = EINVAL; |
| 556 | return NULL; |
| 557 | } |
| 558 | |
| 559 | pool = calloc(1, sizeof(struct rseq_mempool)); |
| 560 | if (!pool) |
| 561 | return NULL; |
| 562 | |
| 563 | memcpy(&pool->attr, &attr, sizeof(attr)); |
| 564 | pthread_mutex_init(&pool->lock, NULL); |
| 565 | pool->item_len = item_len; |
| 566 | pool->item_order = order; |
| 567 | |
| 568 | //TODO: implement multi-range support. |
| 569 | pool->ranges = rseq_mempool_range_create(pool); |
| 570 | if (!pool->ranges) |
| 571 | goto error_alloc; |
| 572 | |
| 573 | if (pool_name) { |
| 574 | pool->name = strdup(pool_name); |
| 575 | if (!pool->name) |
| 576 | goto error_alloc; |
| 577 | } |
| 578 | return pool; |
| 579 | |
| 580 | error_alloc: |
| 581 | rseq_mempool_destroy(pool); |
| 582 | errno = ENOMEM; |
| 583 | return NULL; |
| 584 | } |
| 585 | |
| 586 | /* Always inline for __builtin_return_address(0). */ |
| 587 | static inline __attribute__((always_inline)) |
| 588 | void set_alloc_slot(struct rseq_mempool *pool, size_t item_offset) |
| 589 | { |
| 590 | unsigned long *bitmap = pool->ranges->alloc_bitmap; |
| 591 | size_t item_index = item_offset >> pool->item_order; |
| 592 | unsigned long mask; |
| 593 | size_t k; |
| 594 | |
| 595 | if (!bitmap) |
| 596 | return; |
| 597 | |
| 598 | k = item_index / BIT_PER_ULONG; |
| 599 | mask = 1ULL << (item_index % BIT_PER_ULONG); |
| 600 | |
	/* Report allocator corruption and abort if the bit is already set. */
| 602 | if (bitmap[k] & mask) { |
| 603 | fprintf(stderr, "%s: Allocator corruption detected for pool: \"%s\" (%p), item offset: %zu, caller: %p.\n", |
| 604 | __func__, get_pool_name(pool), pool, item_offset, (void *) __builtin_return_address(0)); |
| 605 | abort(); |
| 606 | } |
| 607 | bitmap[k] |= mask; |
| 608 | } |
| 609 | |
| 610 | static |
| 611 | void __rseq_percpu *__rseq_percpu_malloc(struct rseq_mempool *pool, bool zeroed) |
| 612 | { |
| 613 | struct free_list_node *node; |
| 614 | uintptr_t item_offset; |
| 615 | void __rseq_percpu *addr; |
| 616 | |
| 617 | pthread_mutex_lock(&pool->lock); |
| 618 | /* Get first entry from free list. */ |
| 619 | node = pool->free_list_head; |
| 620 | if (node != NULL) { |
| 621 | /* Remove node from free list (update head). */ |
| 622 | pool->free_list_head = node->next; |
| 623 | item_offset = (uintptr_t) ((void *) node - pool->ranges->base); |
| 624 | addr = (void __rseq_percpu *) (pool->ranges->base + item_offset); |
| 625 | goto end; |
| 626 | } |
| 627 | if (pool->ranges->next_unused + pool->item_len > pool->attr.stride) { |
| 628 | errno = ENOMEM; |
| 629 | addr = NULL; |
| 630 | goto end; |
| 631 | } |
| 632 | item_offset = pool->ranges->next_unused; |
| 633 | addr = (void __rseq_percpu *) (pool->ranges->base + item_offset); |
| 634 | pool->ranges->next_unused += pool->item_len; |
| 635 | end: |
| 636 | if (addr) |
| 637 | set_alloc_slot(pool, item_offset); |
| 638 | pthread_mutex_unlock(&pool->lock); |
| 639 | if (zeroed && addr) |
| 640 | rseq_percpu_zero_item(pool, item_offset); |
| 641 | return addr; |
| 642 | } |
| 643 | |
| 644 | void __rseq_percpu *rseq_mempool_percpu_malloc(struct rseq_mempool *pool) |
| 645 | { |
| 646 | return __rseq_percpu_malloc(pool, false); |
| 647 | } |
| 648 | |
| 649 | void __rseq_percpu *rseq_mempool_percpu_zmalloc(struct rseq_mempool *pool) |
| 650 | { |
| 651 | return __rseq_percpu_malloc(pool, true); |
| 652 | } |
| 653 | |
| 654 | /* Always inline for __builtin_return_address(0). */ |
| 655 | static inline __attribute__((always_inline)) |
| 656 | void clear_alloc_slot(struct rseq_mempool *pool, size_t item_offset) |
| 657 | { |
| 658 | unsigned long *bitmap = pool->ranges->alloc_bitmap; |
| 659 | size_t item_index = item_offset >> pool->item_order; |
| 660 | unsigned long mask; |
| 661 | size_t k; |
| 662 | |
| 663 | if (!bitmap) |
| 664 | return; |
| 665 | |
| 666 | k = item_index / BIT_PER_ULONG; |
| 667 | mask = 1ULL << (item_index % BIT_PER_ULONG); |
| 668 | |
	/* Report a double-free and abort if the bit is not set. */
| 670 | if (!(bitmap[k] & mask)) { |
| 671 | fprintf(stderr, "%s: Double-free detected for pool: \"%s\" (%p), item offset: %zu, caller: %p.\n", |
| 672 | __func__, get_pool_name(pool), pool, item_offset, |
| 673 | (void *) __builtin_return_address(0)); |
| 674 | abort(); |
| 675 | } |
| 676 | bitmap[k] &= ~mask; |
| 677 | } |
| 678 | |
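/*
 * Free an item through its per-CPU handle (the address of its CPU 0
 * instance). Because each range base is aligned on the pool stride,
 * masking the pointer with ~(stride - 1) recovers the range base, from
 * which the range header and the owner pool are found.
 */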
| 679 | void librseq_mempool_percpu_free(void __rseq_percpu *_ptr, size_t stride) |
| 680 | { |
| 681 | uintptr_t ptr = (uintptr_t) _ptr; |
| 682 | void *range_base = (void *) (ptr & (~(stride - 1))); |
| 683 | struct rseq_mempool_range *range = (struct rseq_mempool_range *) (range_base - RANGE_HEADER_OFFSET); |
| 684 | struct rseq_mempool *pool = range->pool; |
| 685 | uintptr_t item_offset = ptr & (stride - 1); |
| 686 | struct free_list_node *head, *item; |
| 687 | |
| 688 | pthread_mutex_lock(&pool->lock); |
| 689 | clear_alloc_slot(pool, item_offset); |
| 690 | /* Add ptr to head of free list */ |
| 691 | head = pool->free_list_head; |
| 692 | /* Free-list is in CPU 0 range. */ |
| 693 | item = (struct free_list_node *) ptr; |
| 694 | item->next = head; |
| 695 | pool->free_list_head = item; |
| 696 | pthread_mutex_unlock(&pool->lock); |
| 697 | } |
| 698 | |
| 699 | struct rseq_mempool_set *rseq_mempool_set_create(void) |
| 700 | { |
| 701 | struct rseq_mempool_set *pool_set; |
| 702 | |
| 703 | pool_set = calloc(1, sizeof(struct rseq_mempool_set)); |
| 704 | if (!pool_set) |
| 705 | return NULL; |
| 706 | pthread_mutex_init(&pool_set->lock, NULL); |
| 707 | return pool_set; |
| 708 | } |
| 709 | |
| 710 | int rseq_mempool_set_destroy(struct rseq_mempool_set *pool_set) |
| 711 | { |
| 712 | int order, ret; |
| 713 | |
| 714 | for (order = POOL_SET_MIN_ENTRY; order < POOL_SET_NR_ENTRIES; order++) { |
| 715 | struct rseq_mempool *pool = pool_set->entries[order]; |
| 716 | |
| 717 | if (!pool) |
| 718 | continue; |
| 719 | ret = rseq_mempool_destroy(pool); |
| 720 | if (ret) |
| 721 | return ret; |
| 722 | pool_set->entries[order] = NULL; |
| 723 | } |
| 724 | pthread_mutex_destroy(&pool_set->lock); |
| 725 | free(pool_set); |
| 726 | return 0; |
| 727 | } |
| 728 | |
| 729 | /* Ownership of pool is handed over to pool set on success. */ |
| 730 | int rseq_mempool_set_add_pool(struct rseq_mempool_set *pool_set, struct rseq_mempool *pool) |
| 731 | { |
| 732 | size_t item_order = pool->item_order; |
| 733 | int ret = 0; |
| 734 | |
| 735 | pthread_mutex_lock(&pool_set->lock); |
| 736 | if (pool_set->entries[item_order]) { |
| 737 | errno = EBUSY; |
| 738 | ret = -1; |
| 739 | goto end; |
| 740 | } |
| 741 | pool_set->entries[pool->item_order] = pool; |
| 742 | end: |
| 743 | pthread_mutex_unlock(&pool_set->lock); |
| 744 | return ret; |
| 745 | } |
| 746 | |
| 747 | static |
| 748 | void __rseq_percpu *__rseq_mempool_set_malloc(struct rseq_mempool_set *pool_set, size_t len, bool zeroed) |
| 749 | { |
| 750 | int order, min_order = POOL_SET_MIN_ENTRY; |
| 751 | struct rseq_mempool *pool; |
| 752 | void __rseq_percpu *addr; |
| 753 | |
| 754 | order = rseq_get_count_order_ulong(len); |
| 755 | if (order > POOL_SET_MIN_ENTRY) |
| 756 | min_order = order; |
| 757 | again: |
| 758 | pthread_mutex_lock(&pool_set->lock); |
| 759 | /* First smallest present pool where @len fits. */ |
| 760 | for (order = min_order; order < POOL_SET_NR_ENTRIES; order++) { |
| 761 | pool = pool_set->entries[order]; |
| 762 | |
| 763 | if (!pool) |
| 764 | continue; |
| 765 | if (pool->item_len >= len) |
| 766 | goto found; |
| 767 | } |
| 768 | pool = NULL; |
| 769 | found: |
| 770 | pthread_mutex_unlock(&pool_set->lock); |
| 771 | if (pool) { |
| 772 | addr = __rseq_percpu_malloc(pool, zeroed); |
| 773 | if (addr == NULL && errno == ENOMEM) { |
| 774 | /* |
| 775 | * If the allocation failed, try again with a |
| 776 | * larger pool. |
| 777 | */ |
| 778 | min_order = order + 1; |
| 779 | goto again; |
| 780 | } |
| 781 | } else { |
| 782 | /* Not found. */ |
| 783 | errno = ENOMEM; |
| 784 | addr = NULL; |
| 785 | } |
| 786 | return addr; |
| 787 | } |
| 788 | |
| 789 | void __rseq_percpu *rseq_mempool_set_percpu_malloc(struct rseq_mempool_set *pool_set, size_t len) |
| 790 | { |
| 791 | return __rseq_mempool_set_malloc(pool_set, len, false); |
| 792 | } |
| 793 | |
| 794 | void __rseq_percpu *rseq_mempool_set_percpu_zmalloc(struct rseq_mempool_set *pool_set, size_t len) |
| 795 | { |
| 796 | return __rseq_mempool_set_malloc(pool_set, len, true); |
| 797 | } |
| 798 | |
| 799 | struct rseq_mempool_attr *rseq_mempool_attr_create(void) |
| 800 | { |
| 801 | return calloc(1, sizeof(struct rseq_mempool_attr)); |
| 802 | } |
| 803 | |
| 804 | void rseq_mempool_attr_destroy(struct rseq_mempool_attr *attr) |
| 805 | { |
| 806 | free(attr); |
| 807 | } |
| 808 | |
| 809 | int rseq_mempool_attr_set_mmap(struct rseq_mempool_attr *attr, |
| 810 | void *(*mmap_func)(void *priv, size_t len), |
| 811 | int (*munmap_func)(void *priv, void *ptr, size_t len), |
| 812 | void *mmap_priv) |
| 813 | { |
| 814 | if (!attr) { |
| 815 | errno = EINVAL; |
| 816 | return -1; |
| 817 | } |
| 818 | attr->mmap_set = true; |
| 819 | attr->mmap_func = mmap_func; |
| 820 | attr->munmap_func = munmap_func; |
| 821 | attr->mmap_priv = mmap_priv; |
| 822 | return 0; |
| 823 | } |
| 824 | |
| 825 | int rseq_mempool_attr_set_robust(struct rseq_mempool_attr *attr) |
| 826 | { |
| 827 | if (!attr) { |
| 828 | errno = EINVAL; |
| 829 | return -1; |
| 830 | } |
| 831 | attr->robust_set = true; |
| 832 | return 0; |
| 833 | } |
| 834 | |
| 835 | int rseq_mempool_attr_set_percpu(struct rseq_mempool_attr *attr, |
| 836 | size_t stride, int max_nr_cpus) |
| 837 | { |
| 838 | if (!attr) { |
| 839 | errno = EINVAL; |
| 840 | return -1; |
| 841 | } |
| 842 | attr->type = MEMPOOL_TYPE_PERCPU; |
| 843 | attr->stride = stride; |
| 844 | attr->max_nr_cpus = max_nr_cpus; |
| 845 | return 0; |
| 846 | } |
| 847 | |
| 848 | int rseq_mempool_attr_set_global(struct rseq_mempool_attr *attr, |
| 849 | size_t stride) |
| 850 | { |
| 851 | if (!attr) { |
| 852 | errno = EINVAL; |
| 853 | return -1; |
| 854 | } |
| 855 | attr->type = MEMPOOL_TYPE_GLOBAL; |
| 856 | attr->stride = stride; |
| 857 | attr->max_nr_cpus = 1; |
| 858 | return 0; |
| 859 | } |