// SPDX-License-Identifier: MIT
// SPDX-FileCopyrightText: 2024 Mathieu Desnoyers <mathieu.desnoyers@efficios.com>

#include <rseq/percpu-alloc.h>
#include <sys/mman.h>
#include <assert.h>
#include <string.h>
#include <pthread.h>
#include <unistd.h>
#include <stdlib.h>
#include <rseq/compiler.h>
#include <errno.h>
#include <stdint.h>
#include <stdbool.h>
#include <stdio.h>

#ifdef HAVE_LIBNUMA
# include <numa.h>
# include <numaif.h>
#endif

/*
 * rseq-percpu-alloc.c: rseq per-cpu memory allocator.
 *
 * The rseq per-cpu memory allocator allows the application to request
 * memory pools of per-cpu memory, each of a given virtual address size
 * per cpu, for a given maximum number of CPUs.
 */
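
/*
 * Usage sketch (illustrative only; the item type, cpu number source and
 * pool parameters below are assumptions, not part of this file):
 *
 *	struct rseq_percpu_pool *pool;
 *	void *handle;
 *	int cpu = 0;	// e.g. from rseq_current_cpu_raw()
 *
 *	// 64 kB of virtual address space per cpu, up to 64 cpus,
 *	// backed by private anonymous memory, no NUMA page moves.
 *	pool = rseq_percpu_pool_create(sizeof(uint64_t), 1UL << 16, 64,
 *			PROT_READ | PROT_WRITE,
 *			MAP_PRIVATE | MAP_ANONYMOUS, -1, 0, 0);
 *	handle = rseq_percpu_zmalloc(pool);
 *	(*(uint64_t *) __rseq_percpu_ptr(handle, cpu))++;
 *	rseq_percpu_free(handle);
 *	rseq_percpu_pool_destroy(pool);
 */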

/* Use lowest bits of per-cpu addresses to index the pool. */
#if RSEQ_BITS_PER_LONG == 64
# define OFFSET_SHIFT	16
#else
# define OFFSET_SHIFT	8
#endif
#define MAX_NR_POOLS	(1U << OFFSET_SHIFT)
#define POOL_MASK	(MAX_NR_POOLS - 1)
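
/*
 * An allocated per-cpu item is returned to the caller as an opaque
 * handle encoding (item_offset << OFFSET_SHIFT) | pool_index. For
 * example, with OFFSET_SHIFT == 16, an item at offset 0x40 within the
 * pool at index 2 is encoded as (0x40 << 16) | 2 == 0x400002;
 * __rseq_percpu_ptr() decodes it with POOL_MASK and OFFSET_SHIFT.
 */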

#define POOL_SET_NR_ENTRIES	(RSEQ_BITS_PER_LONG - OFFSET_SHIFT)
/* 1UL (not 1U): on 64-bit the shift exceeds the width of unsigned int. */
#define POOL_SET_MAX_ALLOC_LEN	(1UL << POOL_SET_NR_ENTRIES)

#if RSEQ_BITS_PER_LONG == 64
# define POOL_SET_MIN_ENTRY	3	/* Smallest item_len=8 */
#else
# define POOL_SET_MIN_ENTRY	2	/* Smallest item_len=4 */
#endif

#define DEFAULT_PAGE_SIZE	4096

struct free_list_node;

struct free_list_node {
	struct free_list_node *next;
};

/* This lock protects pool create/destroy. */
static pthread_mutex_t pool_lock = PTHREAD_MUTEX_INITIALIZER;

struct rseq_percpu_pool {
	void *base;
	unsigned int index;
	size_t item_len;
	size_t percpu_len;
	int item_order;
	int max_nr_cpus;

	/*
	 * The free list chains freed items on the cpu 0 address range.
	 * We should rethink this decision if false sharing between
	 * malloc/free from other cpus and data accesses from cpu 0
	 * becomes an issue. This is a NULL-terminated singly-linked
	 * list.
	 */
	struct free_list_node *free_list_head;
	size_t next_unused;
	/* This lock protects allocation/free within the pool. */
	pthread_mutex_t lock;
};

//TODO: the array of pools should grow dynamically on create.
static struct rseq_percpu_pool rseq_percpu_pool[MAX_NR_POOLS];

/*
 * Pool set entries are indexed by item_len rounded to the next power of
 * 2. A pool set can contain NULL pool entries, in which case the next
 * large enough entry will be used for allocation.
 */
struct rseq_percpu_pool_set {
	/* This lock protects add vs malloc/zmalloc within the pool set. */
	pthread_mutex_t lock;
	struct rseq_percpu_pool *entries[POOL_SET_NR_ENTRIES];
};

#define __rseq_align_mask(v, mask)	(((v) + (mask)) & ~(mask))
#define rseq_align(v, align)		__rseq_align_mask(v, (__typeof__(v)) (align) - 1)
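
/*
 * rseq_align() rounds @v up to the next multiple of @align, which must
 * be a power of two: e.g. rseq_align(13, 8) == 16 and
 * rseq_align(16, 8) == 16.
 */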

static __attribute__((unused))
unsigned int fls_u64(uint64_t x)
{
	unsigned int r = 64;

	if (!x)
		return 0;

	if (!(x & 0xFFFFFFFF00000000ULL)) {
		x <<= 32;
		r -= 32;
	}
	if (!(x & 0xFFFF000000000000ULL)) {
		x <<= 16;
		r -= 16;
	}
	if (!(x & 0xFF00000000000000ULL)) {
		x <<= 8;
		r -= 8;
	}
	if (!(x & 0xF000000000000000ULL)) {
		x <<= 4;
		r -= 4;
	}
	if (!(x & 0xC000000000000000ULL)) {
		x <<= 2;
		r -= 2;
	}
	if (!(x & 0x8000000000000000ULL)) {
		x <<= 1;
		r -= 1;
	}
	return r;
}

static __attribute__((unused))
unsigned int fls_u32(uint32_t x)
{
	unsigned int r = 32;

	if (!x)
		return 0;
	if (!(x & 0xFFFF0000U)) {
		x <<= 16;
		r -= 16;
	}
	if (!(x & 0xFF000000U)) {
		x <<= 8;
		r -= 8;
	}
	if (!(x & 0xF0000000U)) {
		x <<= 4;
		r -= 4;
	}
	if (!(x & 0xC0000000U)) {
		x <<= 2;
		r -= 2;
	}
	if (!(x & 0x80000000U)) {
		x <<= 1;
		r -= 1;
	}
	return r;
}

static
unsigned int fls_ulong(unsigned long x)
{
#if RSEQ_BITS_PER_LONG == 32
	return fls_u32(x);
#else
	return fls_u64(x);
#endif
}

/*
 * Return the minimum order for which x <= (1UL << order).
 * Return -1 if x is 0.
 */
static
int get_count_order_ulong(unsigned long x)
{
	if (!x)
		return -1;

	return fls_ulong(x - 1);
}
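
/*
 * For example, get_count_order_ulong(1) == 0, get_count_order_ulong(8) == 3
 * and get_count_order_ulong(9) == 4.
 */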

static
long rseq_get_page_len(void)
{
	long page_len = sysconf(_SC_PAGE_SIZE);

	if (page_len < 0)
		page_len = DEFAULT_PAGE_SIZE;
	return page_len;
}

static
void *__rseq_pool_percpu_ptr(struct rseq_percpu_pool *pool, int cpu, uintptr_t item_offset)
{
	return pool->base + (pool->percpu_len * cpu) + item_offset;
}

void *__rseq_percpu_ptr(void *_ptr, int cpu)
{
	uintptr_t ptr = (uintptr_t) _ptr;
	uintptr_t item_offset = ptr >> OFFSET_SHIFT;
	uintptr_t pool_index = ptr & POOL_MASK;
	struct rseq_percpu_pool *pool = &rseq_percpu_pool[pool_index];

	assert(cpu >= 0);
	return __rseq_pool_percpu_ptr(pool, cpu, item_offset);
}
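
/*
 * Example use of the decode path above (illustrative; the item type and
 * the cpu number source are assumptions of this sketch):
 *
 *	int cpu = rseq_current_cpu_raw();	// or any valid cpu index
 *	uint64_t *p = (uint64_t *) __rseq_percpu_ptr(handle, cpu);
 *	(*p)++;
 */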

static
void rseq_percpu_zero_item(struct rseq_percpu_pool *pool, uintptr_t item_offset)
{
	int i;

	for (i = 0; i < pool->max_nr_cpus; i++) {
		char *p = __rseq_pool_percpu_ptr(pool, i, item_offset);
		memset(p, 0, pool->item_len);
	}
}

#ifdef HAVE_LIBNUMA
static
void rseq_percpu_pool_init_numa(struct rseq_percpu_pool *pool,
		int numa_flags)
{
	unsigned long nr_pages, page;
	long ret, page_len;
	int cpu;

	if (!numa_flags)
		return;
	page_len = rseq_get_page_len();
	nr_pages = pool->percpu_len >> get_count_order_ulong(page_len);
	for (cpu = 0; cpu < pool->max_nr_cpus; cpu++) {
		int node = numa_node_of_cpu(cpu);

		/* TODO: batch move_pages() call with an array of pages. */
		for (page = 0; page < nr_pages; page++) {
			void *pageptr = __rseq_pool_percpu_ptr(pool, cpu, page * page_len);
			int status = -EPERM;

			ret = move_pages(0, 1, &pageptr, &node, &status, numa_flags);
			if (ret) {
				perror("move_pages");
				abort();
			}
		}
	}
}
#else
static
void rseq_percpu_pool_init_numa(struct rseq_percpu_pool *pool __attribute__((unused)),
		int numa_flags __attribute__((unused)))
{
}
#endif

/*
 * Expected numa_flags:
 * 0: do not move pages to specific numa nodes (use for e.g. mm_cid indexing).
 * MPOL_MF_MOVE: move process-private pages to cpu-specific numa nodes.
 * MPOL_MF_MOVE_ALL: move shared pages to cpu-specific numa nodes (requires CAP_SYS_NICE).
 */
struct rseq_percpu_pool *rseq_percpu_pool_create(size_t item_len,
		size_t percpu_len, int max_nr_cpus,
		int mmap_prot, int mmap_flags, int mmap_fd,
		off_t mmap_offset, int numa_flags)
{
	struct rseq_percpu_pool *pool;
	void *base;
	unsigned int i;
	int order;

	/* Make sure each item is large enough to contain free list pointers. */
	if (item_len < sizeof(void *))
		item_len = sizeof(void *);

	/* Align item_len on next power of two. */
	order = get_count_order_ulong(item_len);
	if (order < 0) {
		errno = EINVAL;
		return NULL;
	}
	item_len = 1UL << order;

	/* Align percpu_len on page size. */
	percpu_len = rseq_align(percpu_len, rseq_get_page_len());

	if (max_nr_cpus < 0 || item_len > percpu_len ||
			percpu_len > (UINTPTR_MAX >> OFFSET_SHIFT)) {
		errno = EINVAL;
		return NULL;
	}

	pthread_mutex_lock(&pool_lock);
	/* Linear scan in array of pools to find empty spot. */
	for (i = 0; i < MAX_NR_POOLS; i++) {
		pool = &rseq_percpu_pool[i];
		if (!pool->base)
			goto found_empty;
	}
	errno = ENOMEM;
	pool = NULL;
	goto end;

found_empty:
	base = mmap(NULL, percpu_len * max_nr_cpus, mmap_prot,
			mmap_flags, mmap_fd, mmap_offset);
	if (base == MAP_FAILED) {
		pool = NULL;
		goto end;
	}
	pthread_mutex_init(&pool->lock, NULL);
	pool->base = base;
	pool->percpu_len = percpu_len;
	pool->max_nr_cpus = max_nr_cpus;
	pool->index = i;
	pool->item_len = item_len;
	pool->item_order = order;
	/*
	 * Initialize NUMA mappings only once the pool fields are
	 * populated, since the NUMA initialization reads them.
	 */
	rseq_percpu_pool_init_numa(pool, numa_flags);
end:
	pthread_mutex_unlock(&pool_lock);
	return pool;
}
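
/*
 * Example call (illustrative): same as a plain anonymous mapping, but
 * additionally asking for process-private pages to be moved to each
 * cpu's numa node:
 *
 *	pool = rseq_percpu_pool_create(128, 1UL << 16, 128,
 *			PROT_READ | PROT_WRITE,
 *			MAP_PRIVATE | MAP_ANONYMOUS, -1, 0, MPOL_MF_MOVE);
 */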

int rseq_percpu_pool_destroy(struct rseq_percpu_pool *pool)
{
	int ret;

	pthread_mutex_lock(&pool_lock);
	if (!pool->base) {
		errno = ENOENT;
		ret = -1;
		goto end;
	}
	ret = munmap(pool->base, pool->percpu_len * pool->max_nr_cpus);
	if (ret)
		goto end;
	pthread_mutex_destroy(&pool->lock);
	memset(pool, 0, sizeof(*pool));
end:
	pthread_mutex_unlock(&pool_lock);
	return ret;
}

static
void *__rseq_percpu_malloc(struct rseq_percpu_pool *pool, bool zeroed)
{
	struct free_list_node *node;
	uintptr_t item_offset;
	void *addr;

	pthread_mutex_lock(&pool->lock);
	/* Get first entry from free list. */
	node = pool->free_list_head;
	if (node != NULL) {
		/* Remove node from free list (update head). */
		pool->free_list_head = node->next;
		item_offset = (uintptr_t) ((void *) node - pool->base);
		addr = (void *) ((item_offset << OFFSET_SHIFT) | pool->index);
		goto end;
	}
	if (pool->next_unused + pool->item_len > pool->percpu_len) {
		/*
		 * Pool exhausted. Set errno so pool set allocation can
		 * fall back to a larger pool.
		 */
		errno = ENOMEM;
		addr = NULL;
		goto end;
	}
	item_offset = pool->next_unused;
	addr = (void *) ((item_offset << OFFSET_SHIFT) | pool->index);
	pool->next_unused += pool->item_len;
end:
	pthread_mutex_unlock(&pool->lock);
	if (zeroed && addr)
		rseq_percpu_zero_item(pool, item_offset);
	return addr;
}

void *rseq_percpu_malloc(struct rseq_percpu_pool *pool)
{
	return __rseq_percpu_malloc(pool, false);
}

void *rseq_percpu_zmalloc(struct rseq_percpu_pool *pool)
{
	return __rseq_percpu_malloc(pool, true);
}

void rseq_percpu_free(void *_ptr)
{
	uintptr_t ptr = (uintptr_t) _ptr;
	uintptr_t item_offset = ptr >> OFFSET_SHIFT;
	uintptr_t pool_index = ptr & POOL_MASK;
	struct rseq_percpu_pool *pool = &rseq_percpu_pool[pool_index];
	struct free_list_node *head, *item;

	pthread_mutex_lock(&pool->lock);
	/* Add ptr to head of free list. */
	head = pool->free_list_head;
	/* Free-list is in cpu 0 range. */
	item = (struct free_list_node *) __rseq_pool_percpu_ptr(pool, 0, item_offset);
	item->next = head;
	pool->free_list_head = item;
	pthread_mutex_unlock(&pool->lock);
}

struct rseq_percpu_pool_set *rseq_percpu_pool_set_create(void)
{
	struct rseq_percpu_pool_set *pool_set;

	pool_set = calloc(1, sizeof(struct rseq_percpu_pool_set));
	if (!pool_set)
		return NULL;
	pthread_mutex_init(&pool_set->lock, NULL);
	return pool_set;
}

int rseq_percpu_pool_set_destroy(struct rseq_percpu_pool_set *pool_set)
{
	int order, ret;

	for (order = POOL_SET_MIN_ENTRY; order < POOL_SET_NR_ENTRIES; order++) {
		struct rseq_percpu_pool *pool = pool_set->entries[order];

		if (!pool)
			continue;
		ret = rseq_percpu_pool_destroy(pool);
		if (ret)
			return ret;
		pool_set->entries[order] = NULL;
	}
	pthread_mutex_destroy(&pool_set->lock);
	free(pool_set);
	return 0;
}

/* Ownership of pool is handed over to pool set on success. */
int rseq_percpu_pool_set_add_pool(struct rseq_percpu_pool_set *pool_set, struct rseq_percpu_pool *pool)
{
	size_t item_order = pool->item_order;
	int ret = 0;

	pthread_mutex_lock(&pool_set->lock);
	if (pool_set->entries[item_order]) {
		errno = EBUSY;
		ret = -1;
		goto end;
	}
	pool_set->entries[item_order] = pool;
end:
	pthread_mutex_unlock(&pool_set->lock);
	return ret;
}
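
/*
 * Example (illustrative): populate a set with pools of item sizes 8 to
 * 1024 bytes, so set allocations are matched to the smallest fitting
 * pool. The percpu_len and max_nr_cpus values are arbitrary:
 *
 *	for (order = POOL_SET_MIN_ENTRY; order <= 10; order++) {
 *		pool = rseq_percpu_pool_create(1UL << order, 1UL << 16, 64,
 *				PROT_READ | PROT_WRITE,
 *				MAP_PRIVATE | MAP_ANONYMOUS, -1, 0, 0);
 *		if (!pool || rseq_percpu_pool_set_add_pool(pool_set, pool))
 *			abort();	// error handling up to the caller
 *	}
 */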

static
void *__rseq_percpu_pool_set_malloc(struct rseq_percpu_pool_set *pool_set, size_t len, bool zeroed)
{
	int order, min_order = POOL_SET_MIN_ENTRY;
	struct rseq_percpu_pool *pool;
	void *addr;

again:
	pthread_mutex_lock(&pool_set->lock);
	/* Find the smallest present pool where @len fits. */
	for (order = min_order; order < POOL_SET_NR_ENTRIES; order++) {
		pool = pool_set->entries[order];

		if (!pool)
			continue;
		if (pool->item_len >= len)
			goto found;
	}
	pool = NULL;
found:
	pthread_mutex_unlock(&pool_set->lock);
	if (pool) {
		addr = __rseq_percpu_malloc(pool, zeroed);
		if (addr == NULL && errno == ENOMEM) {
			/*
			 * If the allocation failed, try again with a
			 * larger pool.
			 */
			min_order = order + 1;
			goto again;
		}
	} else {
		/* Not found. */
		errno = ENOMEM;
		addr = NULL;
	}
	return addr;
}

void *rseq_percpu_pool_set_malloc(struct rseq_percpu_pool_set *pool_set, size_t len)
{
	return __rseq_percpu_pool_set_malloc(pool_set, len, false);
}

void *rseq_percpu_pool_set_zmalloc(struct rseq_percpu_pool_set *pool_set, size_t len)
{
	return __rseq_percpu_pool_set_malloc(pool_set, len, true);
}
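
/*
 * Example (illustrative): with the set populated as sketched above, a
 * 24-byte request is served by the 32-byte pool (order 5), and the
 * returned handle is resolved per cpu as usual:
 *
 *	void *handle = rseq_percpu_pool_set_zmalloc(pool_set, 24);
 *	struct my_item *item;	// hypothetical 24-byte type
 *
 *	if (handle)
 *		item = (struct my_item *) __rseq_percpu_ptr(handle, cpu);
 */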