rseq percpu: Use high bits for pool index
[librseq.git] / src/rseq-percpu-alloc.c
// SPDX-License-Identifier: MIT
// SPDX-FileCopyrightText: 2024 Mathieu Desnoyers <mathieu.desnoyers@efficios.com>

#include <rseq/percpu-alloc.h>
#include <sys/mman.h>
#include <assert.h>
#include <string.h>
#include <pthread.h>
#include <unistd.h>
#include <stdlib.h>
#include <rseq/compiler.h>
#include <errno.h>
#include <stdint.h>
#include <stdbool.h>
#include <stdio.h>

#ifdef HAVE_LIBNUMA
# include <numa.h>
# include <numaif.h>
#endif

/*
 * rseq-percpu-alloc.c: rseq per-cpu memory allocator.
 *
 * The rseq per-cpu memory allocator allows the application to request
 * memory pools of per-cpu memory, each of a given virtual address size
 * per CPU, for a given maximum number of CPUs.
 */
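
/*
 * Example usage (an illustrative sketch, not compiled as part of this
 * file; struct my_data with its count field and possible_cpus() stand
 * in for an application-defined type and an application-provided way
 * to obtain the CPU count):
 *
 *	struct rseq_percpu_pool *pool;
 *	struct my_data *p;
 *	int cpu;
 *
 *	pool = rseq_percpu_pool_create(sizeof(struct my_data), 4096,
 *			possible_cpus(), PROT_READ | PROT_WRITE,
 *			MAP_ANONYMOUS | MAP_PRIVATE, -1, 0, 0);
 *	p = rseq_percpu_zmalloc(pool);		// One item per cpu, zeroed.
 *	cpu = sched_getcpu();
 *	((struct my_data *) __rseq_percpu_ptr(p, cpu))->count++;
 *	rseq_percpu_free(p);
 *	rseq_percpu_pool_destroy(pool);
 */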

/*
 * Use high bits of per-cpu addresses to index the pool.
 * This leaves the low bits available to the application for pointer
 * tagging (based on next power of 2 alignment of the allocations).
 */
#if RSEQ_BITS_PER_LONG == 64
# define POOL_INDEX_BITS	16
#else
# define POOL_INDEX_BITS	8
#endif
#define MAX_NR_POOLS		(1UL << POOL_INDEX_BITS)
#define POOL_INDEX_SHIFT	(RSEQ_BITS_PER_LONG - POOL_INDEX_BITS)
#define MAX_POOL_LEN		(1UL << POOL_INDEX_SHIFT)
#define MAX_POOL_LEN_MASK	(MAX_POOL_LEN - 1)
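
/*
 * Worked example of the resulting encoding on a 64-bit build:
 * POOL_INDEX_BITS = 16, so POOL_INDEX_SHIFT = 48, MAX_NR_POOLS = 65536
 * and each pool can span up to MAX_POOL_LEN = 2^48 bytes of virtual
 * address range. A per-cpu pointer for item offset 0x40 in the pool at
 * index 2 is encoded as (2UL << 48) | 0x40; the offset bits below the
 * item alignment stay clear and can be used by the application for
 * pointer tagging.
 */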

#define POOL_SET_NR_ENTRIES	POOL_INDEX_SHIFT

/*
 * Smallest allocation should hold enough space for a free list pointer.
 */
#if RSEQ_BITS_PER_LONG == 64
# define POOL_SET_MIN_ENTRY	3	/* Smallest item_len=8 */
#else
# define POOL_SET_MIN_ENTRY	2	/* Smallest item_len=4 */
#endif

#define DEFAULT_PAGE_SIZE	4096

struct free_list_node;

struct free_list_node {
	struct free_list_node *next;
};

/* This lock protects pool create/destroy. */
static pthread_mutex_t pool_lock = PTHREAD_MUTEX_INITIALIZER;

struct rseq_percpu_pool {
	void *base;
	unsigned int index;
	size_t item_len;
	size_t percpu_len;
	int item_order;
	int max_nr_cpus;

	/*
	 * The free list chains freed items on the cpu 0 address range.
	 * We should rethink this decision if false sharing between
	 * malloc/free from other cpus and data accesses from cpu 0
	 * becomes an issue. This is a NULL-terminated singly-linked
	 * list.
	 */
	struct free_list_node *free_list_head;
	size_t next_unused;
	/* This lock protects allocation/free within the pool. */
	pthread_mutex_t lock;
};

//TODO: the array of pools should grow dynamically on create.
static struct rseq_percpu_pool rseq_percpu_pool[MAX_NR_POOLS];

/*
 * Pool set entries are indexed by item_len rounded to the next power of
 * 2. A pool set can contain NULL pool entries, in which case the next
 * large enough entry will be used for allocation.
 */
struct rseq_percpu_pool_set {
	/* This lock protects add vs malloc/zmalloc within the pool set. */
	pthread_mutex_t lock;
	struct rseq_percpu_pool *entries[POOL_SET_NR_ENTRIES];
};
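
/*
 * For instance, a pool set populated with pools of item_len 8, 16 and
 * 64 has non-NULL entries[3], entries[4] and entries[6]. A 24-byte
 * allocation scans upward from POOL_SET_MIN_ENTRY, skips the 8-byte and
 * 16-byte pools, and is served from entries[6], the smallest present
 * pool large enough for the request.
 */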

#define __rseq_align_mask(v, mask)	(((v) + (mask)) & ~(mask))
#define rseq_align(v, align)		__rseq_align_mask(v, (__typeof__(v)) (align) - 1)

static __attribute__((unused))
unsigned int fls_u64(uint64_t x)
{
	unsigned int r = 64;

	if (!x)
		return 0;

	if (!(x & 0xFFFFFFFF00000000ULL)) {
		x <<= 32;
		r -= 32;
	}
	if (!(x & 0xFFFF000000000000ULL)) {
		x <<= 16;
		r -= 16;
	}
	if (!(x & 0xFF00000000000000ULL)) {
		x <<= 8;
		r -= 8;
	}
	if (!(x & 0xF000000000000000ULL)) {
		x <<= 4;
		r -= 4;
	}
	if (!(x & 0xC000000000000000ULL)) {
		x <<= 2;
		r -= 2;
	}
	if (!(x & 0x8000000000000000ULL)) {
		x <<= 1;
		r -= 1;
	}
	return r;
}

static __attribute__((unused))
unsigned int fls_u32(uint32_t x)
{
	unsigned int r = 32;

	if (!x)
		return 0;
	if (!(x & 0xFFFF0000U)) {
		x <<= 16;
		r -= 16;
	}
	if (!(x & 0xFF000000U)) {
		x <<= 8;
		r -= 8;
	}
	if (!(x & 0xF0000000U)) {
		x <<= 4;
		r -= 4;
	}
	if (!(x & 0xC0000000U)) {
		x <<= 2;
		r -= 2;
	}
	if (!(x & 0x80000000U)) {
		x <<= 1;
		r -= 1;
	}
	return r;
}

static
unsigned int fls_ulong(unsigned long x)
{
#if RSEQ_BITS_PER_LONG == 32
	return fls_u32(x);
#else
	return fls_u64(x);
#endif
}

/*
 * Return the minimum order for which x <= (1UL << order).
 * Return -1 if x is 0.
 */
static
int get_count_order_ulong(unsigned long x)
{
	if (!x)
		return -1;

	return fls_ulong(x - 1);
}
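
/*
 * For example, get_count_order_ulong(24) returns 5 (24 <= 1UL << 5) and
 * get_count_order_ulong(32) also returns 5 (32 == 1UL << 5); this is
 * what rounds item_len up to the next power of 2 in
 * rseq_percpu_pool_create() below.
 */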

static
long rseq_get_page_len(void)
{
	long page_len = sysconf(_SC_PAGE_SIZE);

	if (page_len < 0)
		page_len = DEFAULT_PAGE_SIZE;
	return page_len;
}

static
void *__rseq_pool_percpu_ptr(struct rseq_percpu_pool *pool, int cpu, uintptr_t item_offset)
{
	return pool->base + (pool->percpu_len * cpu) + item_offset;
}

void *__rseq_percpu_ptr(void *_ptr, int cpu)
{
	uintptr_t ptr = (uintptr_t) _ptr;
	uintptr_t item_offset = ptr & MAX_POOL_LEN_MASK;
	uintptr_t pool_index = ptr >> POOL_INDEX_SHIFT;
	struct rseq_percpu_pool *pool = &rseq_percpu_pool[pool_index];

	assert(cpu >= 0);
	return __rseq_pool_percpu_ptr(pool, cpu, item_offset);
}

static
void rseq_percpu_zero_item(struct rseq_percpu_pool *pool, uintptr_t item_offset)
{
	int i;

	for (i = 0; i < pool->max_nr_cpus; i++) {
		char *p = __rseq_pool_percpu_ptr(pool, i, item_offset);
		memset(p, 0, pool->item_len);
	}
}

#ifdef HAVE_LIBNUMA
static
void rseq_percpu_pool_init_numa(struct rseq_percpu_pool *pool,
		int numa_flags)
{
	unsigned long nr_pages, page;
	long ret, page_len;
	int cpu;

	if (!numa_flags)
		return;
	page_len = rseq_get_page_len();
	nr_pages = pool->percpu_len >> get_count_order_ulong(page_len);
	for (cpu = 0; cpu < pool->max_nr_cpus; cpu++) {
		int node = numa_node_of_cpu(cpu);

		/* TODO: batch move_pages() call with an array of pages. */
		for (page = 0; page < nr_pages; page++) {
			void *pageptr = __rseq_pool_percpu_ptr(pool, cpu, page * page_len);
			int status = -EPERM;

			ret = move_pages(0, 1, &pageptr, &node, &status, numa_flags);
			if (ret) {
				perror("move_pages");
				abort();
			}
		}
	}
}
#else
static
void rseq_percpu_pool_init_numa(struct rseq_percpu_pool *pool __attribute__((unused)),
		int numa_flags __attribute__((unused)))
{
}
#endif

/*
 * Expected numa_flags:
 * 0: do not move pages to specific numa nodes (use for e.g. mm_cid indexing).
 * MPOL_MF_MOVE: move process-private pages to cpu-specific numa nodes.
 * MPOL_MF_MOVE_ALL: move shared pages to cpu-specific numa nodes (requires CAP_SYS_NICE).
 */
struct rseq_percpu_pool *rseq_percpu_pool_create(size_t item_len,
		size_t percpu_len, int max_nr_cpus,
		int mmap_prot, int mmap_flags, int mmap_fd,
		off_t mmap_offset, int numa_flags)
{
	struct rseq_percpu_pool *pool;
	void *base;
	unsigned int i;
	int order;

	/* Make sure each item is large enough to contain free list pointers. */
	if (item_len < sizeof(void *))
		item_len = sizeof(void *);

	/* Align item_len on next power of two. */
	order = get_count_order_ulong(item_len);
	if (order < 0) {
		errno = EINVAL;
		return NULL;
	}
	item_len = 1UL << order;

	/* Align percpu_len on page size. */
	percpu_len = rseq_align(percpu_len, rseq_get_page_len());

	if (max_nr_cpus < 0 || item_len > percpu_len ||
			percpu_len > (UINTPTR_MAX >> POOL_INDEX_BITS)) {
		errno = EINVAL;
		return NULL;
	}

	pthread_mutex_lock(&pool_lock);
	/* Linear scan in array of pools to find empty spot. */
	for (i = 0; i < MAX_NR_POOLS; i++) {
		pool = &rseq_percpu_pool[i];
		if (!pool->base)
			goto found_empty;
	}
	errno = ENOMEM;
	pool = NULL;
	goto end;

found_empty:
	base = mmap(NULL, percpu_len * max_nr_cpus, mmap_prot,
			mmap_flags, mmap_fd, mmap_offset);
	if (base == MAP_FAILED) {
		pool = NULL;
		goto end;
	}
	pthread_mutex_init(&pool->lock, NULL);
	pool->base = base;
	pool->percpu_len = percpu_len;
	pool->max_nr_cpus = max_nr_cpus;
	pool->index = i;
	pool->item_len = item_len;
	pool->item_order = order;
	/*
	 * NUMA page placement reads the pool base, percpu_len and
	 * max_nr_cpus fields, so it is performed once the pool is
	 * initialized rather than right after mmap.
	 */
	rseq_percpu_pool_init_numa(pool, numa_flags);
end:
	pthread_mutex_unlock(&pool_lock);
	return pool;
}

int rseq_percpu_pool_destroy(struct rseq_percpu_pool *pool)
{
	int ret;

	pthread_mutex_lock(&pool_lock);
	if (!pool->base) {
		errno = ENOENT;
		ret = -1;
		goto end;
	}
	ret = munmap(pool->base, pool->percpu_len * pool->max_nr_cpus);
	if (ret)
		goto end;
	pthread_mutex_destroy(&pool->lock);
	memset(pool, 0, sizeof(*pool));
end:
	pthread_mutex_unlock(&pool_lock);
	return ret;
}

static
void *__rseq_percpu_malloc(struct rseq_percpu_pool *pool, bool zeroed)
{
	struct free_list_node *node;
	uintptr_t item_offset;
	void *addr;

	pthread_mutex_lock(&pool->lock);
	/* Get first entry from free list. */
	node = pool->free_list_head;
	if (node != NULL) {
		/* Remove node from free list (update head). */
		pool->free_list_head = node->next;
		item_offset = (uintptr_t) ((void *) node - pool->base);
		addr = (void *) (((uintptr_t) pool->index << POOL_INDEX_SHIFT) | item_offset);
		goto end;
	}
	if (pool->next_unused + pool->item_len > pool->percpu_len) {
		/* Pool exhausted: let callers fall back to a larger pool. */
		errno = ENOMEM;
		addr = NULL;
		goto end;
	}
	item_offset = pool->next_unused;
	addr = (void *) (((uintptr_t) pool->index << POOL_INDEX_SHIFT) | item_offset);
	pool->next_unused += pool->item_len;
end:
	pthread_mutex_unlock(&pool->lock);
	if (zeroed && addr)
		rseq_percpu_zero_item(pool, item_offset);
	return addr;
}

void *rseq_percpu_malloc(struct rseq_percpu_pool *pool)
{
	return __rseq_percpu_malloc(pool, false);
}

void *rseq_percpu_zmalloc(struct rseq_percpu_pool *pool)
{
	return __rseq_percpu_malloc(pool, true);
}

void rseq_percpu_free(void *_ptr)
{
	uintptr_t ptr = (uintptr_t) _ptr;
	uintptr_t item_offset = ptr & MAX_POOL_LEN_MASK;
	uintptr_t pool_index = ptr >> POOL_INDEX_SHIFT;
	struct rseq_percpu_pool *pool = &rseq_percpu_pool[pool_index];
	struct free_list_node *head, *item;

	pthread_mutex_lock(&pool->lock);
	/* Add ptr to head of free list. */
	head = pool->free_list_head;
	/* Free-list is in cpu 0 range. */
	item = (struct free_list_node *)__rseq_pool_percpu_ptr(pool, 0, item_offset);
	item->next = head;
	pool->free_list_head = item;
	pthread_mutex_unlock(&pool->lock);
}

struct rseq_percpu_pool_set *rseq_percpu_pool_set_create(void)
{
	struct rseq_percpu_pool_set *pool_set;

	pool_set = calloc(1, sizeof(struct rseq_percpu_pool_set));
	if (!pool_set)
		return NULL;
	pthread_mutex_init(&pool_set->lock, NULL);
	return pool_set;
}

int rseq_percpu_pool_set_destroy(struct rseq_percpu_pool_set *pool_set)
{
	int order, ret;

	for (order = POOL_SET_MIN_ENTRY; order < POOL_SET_NR_ENTRIES; order++) {
		struct rseq_percpu_pool *pool = pool_set->entries[order];

		if (!pool)
			continue;
		ret = rseq_percpu_pool_destroy(pool);
		if (ret)
			return ret;
		pool_set->entries[order] = NULL;
	}
	pthread_mutex_destroy(&pool_set->lock);
	free(pool_set);
	return 0;
}

/* Ownership of pool is handed over to pool set on success. */
int rseq_percpu_pool_set_add_pool(struct rseq_percpu_pool_set *pool_set, struct rseq_percpu_pool *pool)
{
	size_t item_order = pool->item_order;
	int ret = 0;

	pthread_mutex_lock(&pool_set->lock);
	if (pool_set->entries[item_order]) {
		errno = EBUSY;
		ret = -1;
		goto end;
	}
	pool_set->entries[pool->item_order] = pool;
end:
	pthread_mutex_unlock(&pool_set->lock);
	return ret;
}

static
void *__rseq_percpu_pool_set_malloc(struct rseq_percpu_pool_set *pool_set, size_t len, bool zeroed)
{
	int order, min_order = POOL_SET_MIN_ENTRY;
	struct rseq_percpu_pool *pool;
	void *addr;

again:
	pthread_mutex_lock(&pool_set->lock);
	/* First smallest present pool where @len fits. */
	for (order = min_order; order < POOL_SET_NR_ENTRIES; order++) {
		pool = pool_set->entries[order];

		if (!pool)
			continue;
		if (pool->item_len >= len)
			goto found;
	}
	pool = NULL;
found:
	pthread_mutex_unlock(&pool_set->lock);
	if (pool) {
		addr = __rseq_percpu_malloc(pool, zeroed);
		if (addr == NULL && errno == ENOMEM) {
			/*
			 * If the allocation failed, try again with a
			 * larger pool.
			 */
			min_order = order + 1;
			goto again;
		}
	} else {
		/* Not found. */
		errno = ENOMEM;
		addr = NULL;
	}
	return addr;
}
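
/*
 * Illustration of the retry path above: with pools registered at
 * entries[5] (item_len 32) and entries[7] (item_len 128), a 40-byte
 * request first selects the 128-byte pool. If that pool is exhausted,
 * __rseq_percpu_malloc() fails with errno set to ENOMEM, min_order
 * becomes 8, and the scan is retried; since no larger pool is present,
 * the allocation fails with ENOMEM.
 */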

void *rseq_percpu_pool_set_malloc(struct rseq_percpu_pool_set *pool_set, size_t len)
{
	return __rseq_percpu_pool_set_malloc(pool_set, len, false);
}

void *rseq_percpu_pool_set_zmalloc(struct rseq_percpu_pool_set *pool_set, size_t len)
{
	return __rseq_percpu_pool_set_malloc(pool_set, len, true);
}