rseq percpu alloc: Implement numa support
[librseq.git] / src/rseq-percpu-alloc.c
// SPDX-License-Identifier: MIT
// SPDX-FileCopyrightText: 2024 Mathieu Desnoyers <mathieu.desnoyers@efficios.com>

#include <rseq/percpu-alloc.h>
#include <sys/mman.h>
#include <assert.h>
#include <string.h>
#include <pthread.h>
#include <unistd.h>
#include <stdlib.h>
#include <rseq/compiler.h>
#include <errno.h>
#include <stdint.h>
#include <stdbool.h>
#include <stdio.h>

#ifdef HAVE_LIBNUMA
# include <numa.h>
# include <numaif.h>
#endif

/*
 * rseq-percpu-alloc.c: rseq per-cpu memory allocator.
 *
 * The rseq per-cpu memory allocator allows the application to request
 * memory pools of per-cpu memory, each of a given virtual address size
 * per CPU, for a given maximum number of CPUs.
 */
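
/*
 * Illustrative usage sketch. The rseq_percpu_ptr() accessor is assumed
 * to be the typed helper provided by <rseq/percpu-alloc.h> on top of
 * __rseq_percpu_ptr(); struct counter, nr_cpus and the sizes below are
 * placeholders chosen for the example:
 *
 *	struct counter { intptr_t count; };
 *	struct rseq_percpu_pool *pool;
 *	struct counter *c;
 *
 *	pool = rseq_percpu_pool_create(sizeof(struct counter),
 *			16 * 4096, nr_cpus,
 *			PROT_READ | PROT_WRITE,
 *			MAP_PRIVATE | MAP_ANONYMOUS, -1, 0, 0);
 *	c = rseq_percpu_zmalloc(pool);
 *	rseq_percpu_ptr(c, cpu)->count++;	// access the copy of CPU "cpu"
 *	rseq_percpu_free(c);
 */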

/* Use lowest bits of per-cpu addresses to index the pool. */
#if RSEQ_BITS_PER_LONG == 64
# define OFFSET_SHIFT	16
#else
# define OFFSET_SHIFT	8
#endif
#define MAX_NR_POOLS	(1U << OFFSET_SHIFT)
#define POOL_MASK	(MAX_NR_POOLS - 1)
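
/*
 * Pointers returned by the allocator encode both the pool index and the
 * item offset within a per-cpu range:
 * addr = (item_offset << OFFSET_SHIFT) | pool_index.
 * For example, on 64-bit (OFFSET_SHIFT == 16), the third 64-byte item of
 * pool 2 is encoded as (128 << 16) | 2. The real per-cpu address is
 * recovered by __rseq_percpu_ptr() below.
 */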

#define POOL_SET_NR_ENTRIES	(RSEQ_BITS_PER_LONG - OFFSET_SHIFT)
#define POOL_SET_MAX_ALLOC_LEN	(1UL << POOL_SET_NR_ENTRIES)

#if RSEQ_BITS_PER_LONG == 64
# define POOL_SET_MIN_ENTRY	3	/* Smallest item_len=8 */
#else
# define POOL_SET_MIN_ENTRY	2	/* Smallest item_len=4 */
#endif

#define DEFAULT_PAGE_SIZE	4096

struct free_list_node;

struct free_list_node {
	struct free_list_node *next;
};

/* This lock protects pool create/destroy. */
static pthread_mutex_t pool_lock = PTHREAD_MUTEX_INITIALIZER;

struct rseq_percpu_pool {
	void *base;
	unsigned int index;
	size_t item_len;
	size_t percpu_len;
	int item_order;
	int max_nr_cpus;

	/*
	 * The free list chains freed items on the cpu 0 address range.
	 * We should rethink this decision if false sharing between
	 * malloc/free from other cpus and data accesses from cpu 0
	 * becomes an issue. This is a NULL-terminated singly-linked
	 * list.
	 */
	struct free_list_node *free_list_head;
	size_t next_unused;
	/* This lock protects allocation/free within the pool. */
	pthread_mutex_t lock;
};

//TODO: the array of pools should grow dynamically on create.
static struct rseq_percpu_pool rseq_percpu_pool[MAX_NR_POOLS];

/*
 * Pool set entries are indexed by item_len rounded to the next power of
 * 2. A pool set can contain NULL pool entries, in which case the next
 * large enough entry will be used for allocation.
 */
struct rseq_percpu_pool_set {
	/* This lock protects add vs malloc/zmalloc within the pool set. */
	pthread_mutex_t lock;
	struct rseq_percpu_pool *entries[POOL_SET_NR_ENTRIES];
};
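
/*
 * For instance, an allocation request of 24 bytes maps to order 5 (next
 * power of two is 32), so it is served by entries[5] if a pool with
 * item_len 32 was added there, or by the next larger non-NULL entry
 * otherwise.
 */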

#define __rseq_align_mask(v, mask)	(((v) + (mask)) & ~(mask))
#define rseq_align(v, align)		__rseq_align_mask(v, (__typeof__(v)) (align) - 1)

static __attribute__((unused))
unsigned int fls_u64(uint64_t x)
{
	unsigned int r = 64;

	if (!x)
		return 0;

	if (!(x & 0xFFFFFFFF00000000ULL)) {
		x <<= 32;
		r -= 32;
	}
	if (!(x & 0xFFFF000000000000ULL)) {
		x <<= 16;
		r -= 16;
	}
	if (!(x & 0xFF00000000000000ULL)) {
		x <<= 8;
		r -= 8;
	}
	if (!(x & 0xF000000000000000ULL)) {
		x <<= 4;
		r -= 4;
	}
	if (!(x & 0xC000000000000000ULL)) {
		x <<= 2;
		r -= 2;
	}
	if (!(x & 0x8000000000000000ULL)) {
		x <<= 1;
		r -= 1;
	}
	return r;
}

static __attribute__((unused))
unsigned int fls_u32(uint32_t x)
{
	unsigned int r = 32;

	if (!x)
		return 0;
	if (!(x & 0xFFFF0000U)) {
		x <<= 16;
		r -= 16;
	}
	if (!(x & 0xFF000000U)) {
		x <<= 8;
		r -= 8;
	}
	if (!(x & 0xF0000000U)) {
		x <<= 4;
		r -= 4;
	}
	if (!(x & 0xC0000000U)) {
		x <<= 2;
		r -= 2;
	}
	if (!(x & 0x80000000U)) {
		x <<= 1;
		r -= 1;
	}
	return r;
}

static
unsigned int fls_ulong(unsigned long x)
{
#if RSEQ_BITS_PER_LONG == 32
	return fls_u32(x);
#else
	return fls_u64(x);
#endif
}

/*
 * Return the minimum order for which x <= (1UL << order).
 * Return -1 if x is 0.
 */
static
int get_count_order_ulong(unsigned long x)
{
	if (!x)
		return -1;

	return fls_ulong(x - 1);
}

static
long rseq_get_page_len(void)
{
	long page_len = sysconf(_SC_PAGE_SIZE);

	if (page_len < 0)
		page_len = DEFAULT_PAGE_SIZE;
	return page_len;
}

static
void *__rseq_pool_percpu_ptr(struct rseq_percpu_pool *pool, int cpu, uintptr_t item_offset)
{
	return pool->base + (pool->percpu_len * cpu) + item_offset;
}

void *__rseq_percpu_ptr(void *_ptr, int cpu)
{
	uintptr_t ptr = (uintptr_t) _ptr;
	uintptr_t item_offset = ptr >> OFFSET_SHIFT;
	uintptr_t pool_index = ptr & POOL_MASK;
	struct rseq_percpu_pool *pool = &rseq_percpu_pool[pool_index];

	assert(cpu >= 0);
	return __rseq_pool_percpu_ptr(pool, cpu, item_offset);
}
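
/*
 * Note: the cpu argument is expected to be within [0, max_nr_cpus) of the
 * pool backing the encoded pointer; only the lower bound is asserted here.
 * Callers presumably go through the typed rseq_percpu_ptr() accessor
 * declared in <rseq/percpu-alloc.h> rather than calling this function
 * directly.
 */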

static
void rseq_percpu_zero_item(struct rseq_percpu_pool *pool, uintptr_t item_offset)
{
	int i;

	for (i = 0; i < pool->max_nr_cpus; i++) {
		char *p = __rseq_pool_percpu_ptr(pool, i, item_offset);
		memset(p, 0, pool->item_len);
	}
}

#ifdef HAVE_LIBNUMA
static
void rseq_percpu_pool_init_numa(struct rseq_percpu_pool *pool,
		int numa_flags)
{
	unsigned long nr_pages, page;
	long ret, page_len;
	int cpu;

	if (!numa_flags)
		return;
	page_len = rseq_get_page_len();
	nr_pages = pool->percpu_len >> get_count_order_ulong(page_len);
	for (cpu = 0; cpu < pool->max_nr_cpus; cpu++) {
		int node = numa_node_of_cpu(cpu);

		/* TODO: batch move_pages() call with an array of pages. */
		for (page = 0; page < nr_pages; page++) {
			void *pageptr = __rseq_pool_percpu_ptr(pool, cpu, page * page_len);
			int status = -EPERM;

			ret = move_pages(0, 1, &pageptr, &node, &status, numa_flags);
			if (ret) {
				perror("move_pages");
				abort();
			}
		}
	}
}
#else
static
void rseq_percpu_pool_init_numa(struct rseq_percpu_pool *pool __attribute__((unused)),
		int numa_flags __attribute__((unused)))
{
}
#endif

/*
 * Expected numa_flags:
 * 0: do not move pages to specific numa nodes (use for e.g. mm_cid indexing).
 * MPOL_MF_MOVE: move process-private pages to cpu-specific numa nodes.
 * MPOL_MF_MOVE_ALL: move shared pages to cpu-specific numa nodes (requires CAP_SYS_NICE).
 */
struct rseq_percpu_pool *rseq_percpu_pool_create(size_t item_len,
		size_t percpu_len, int max_nr_cpus,
		int mmap_prot, int mmap_flags, int mmap_fd,
		off_t mmap_offset, int numa_flags)
{
	struct rseq_percpu_pool *pool;
	void *base;
	unsigned int i;
	int order;

	/* Make sure each item is large enough to contain free list pointers. */
	if (item_len < sizeof(void *))
		item_len = sizeof(void *);

	/* Align item_len on next power of two. */
	order = get_count_order_ulong(item_len);
	if (order < 0) {
		errno = EINVAL;
		return NULL;
	}
	item_len = 1UL << order;

	/* Align percpu_len on page size. */
	percpu_len = rseq_align(percpu_len, rseq_get_page_len());

	if (max_nr_cpus < 0 || item_len > percpu_len ||
			percpu_len > (UINTPTR_MAX >> OFFSET_SHIFT)) {
		errno = EINVAL;
		return NULL;
	}

	pthread_mutex_lock(&pool_lock);
	/* Linear scan in array of pools to find empty spot. */
	for (i = 0; i < MAX_NR_POOLS; i++) {
		pool = &rseq_percpu_pool[i];
		if (!pool->base)
			goto found_empty;
	}
	errno = ENOMEM;
	pool = NULL;
	goto end;

found_empty:
	base = mmap(NULL, percpu_len * max_nr_cpus, mmap_prot,
			mmap_flags, mmap_fd, mmap_offset);
	if (base == MAP_FAILED) {
		pool = NULL;
		goto end;
	}
	pthread_mutex_init(&pool->lock, NULL);
	pool->base = base;
	pool->percpu_len = percpu_len;
	pool->max_nr_cpus = max_nr_cpus;
	pool->index = i;
	pool->item_len = item_len;
	pool->item_order = order;
	/*
	 * Initialize numa placement only once the pool fields it relies on
	 * (base, percpu_len, max_nr_cpus) have been set.
	 */
	rseq_percpu_pool_init_numa(pool, numa_flags);
end:
	pthread_mutex_unlock(&pool_lock);
	return pool;
}
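
/*
 * Illustrative call: a process-private pool whose pages get moved to the
 * NUMA node of each CPU. nr_cpus and the counter type from the sketch at
 * the top of this file are placeholders:
 *
 *	pool = rseq_percpu_pool_create(sizeof(struct counter),
 *			64 * 4096, nr_cpus,
 *			PROT_READ | PROT_WRITE,
 *			MAP_PRIVATE | MAP_ANONYMOUS, -1, 0,
 *			MPOL_MF_MOVE);
 */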

int rseq_percpu_pool_destroy(struct rseq_percpu_pool *pool)
{
	int ret;

	pthread_mutex_lock(&pool_lock);
	if (!pool->base) {
		errno = ENOENT;
		ret = -1;
		goto end;
	}
	ret = munmap(pool->base, pool->percpu_len * pool->max_nr_cpus);
	if (ret)
		goto end;
	pthread_mutex_destroy(&pool->lock);
	memset(pool, 0, sizeof(*pool));
end:
	pthread_mutex_unlock(&pool_lock);
	return ret;
}

static
void *__rseq_percpu_malloc(struct rseq_percpu_pool *pool, bool zeroed)
{
	struct free_list_node *node;
	uintptr_t item_offset;
	void *addr;

	pthread_mutex_lock(&pool->lock);
	/* Get first entry from free list. */
	node = pool->free_list_head;
	if (node != NULL) {
		/* Remove node from free list (update head). */
		pool->free_list_head = node->next;
		item_offset = (uintptr_t) ((void *) node - pool->base);
		addr = (void *) ((item_offset << OFFSET_SHIFT) | pool->index);
		goto end;
	}
	if (pool->next_unused + pool->item_len > pool->percpu_len) {
		/* Pool exhausted: report it so callers can fall back. */
		errno = ENOMEM;
		addr = NULL;
		goto end;
	}
	item_offset = pool->next_unused;
	addr = (void *) ((item_offset << OFFSET_SHIFT) | pool->index);
	pool->next_unused += pool->item_len;
end:
	pthread_mutex_unlock(&pool->lock);
	if (zeroed && addr)
		rseq_percpu_zero_item(pool, item_offset);
	return addr;
}

void *rseq_percpu_malloc(struct rseq_percpu_pool *pool)
{
	return __rseq_percpu_malloc(pool, false);
}

void *rseq_percpu_zmalloc(struct rseq_percpu_pool *pool)
{
	return __rseq_percpu_malloc(pool, true);
}

void rseq_percpu_free(void *_ptr)
{
	uintptr_t ptr = (uintptr_t) _ptr;
	uintptr_t item_offset = ptr >> OFFSET_SHIFT;
	uintptr_t pool_index = ptr & POOL_MASK;
	struct rseq_percpu_pool *pool = &rseq_percpu_pool[pool_index];
	struct free_list_node *head, *item;

	pthread_mutex_lock(&pool->lock);
	/* Add ptr to head of free list */
	head = pool->free_list_head;
	/* Free-list is in cpu 0 range. */
	item = (struct free_list_node *) __rseq_pool_percpu_ptr(pool, 0, item_offset);
	item->next = head;
	pool->free_list_head = item;
	pthread_mutex_unlock(&pool->lock);
}
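
/*
 * Note: rseq_percpu_free() expects the encoded pointer returned by
 * rseq_percpu_malloc()/rseq_percpu_zmalloc(), not a cpu-specific address
 * obtained through __rseq_percpu_ptr(). The freed item is linked into the
 * free list through its cpu 0 copy.
 */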

struct rseq_percpu_pool_set *rseq_percpu_pool_set_create(void)
{
	struct rseq_percpu_pool_set *pool_set;

	pool_set = calloc(1, sizeof(struct rseq_percpu_pool_set));
	if (!pool_set)
		return NULL;
	pthread_mutex_init(&pool_set->lock, NULL);
	return pool_set;
}

int rseq_percpu_pool_set_destroy(struct rseq_percpu_pool_set *pool_set)
{
	int order, ret;

	for (order = POOL_SET_MIN_ENTRY; order < POOL_SET_NR_ENTRIES; order++) {
		struct rseq_percpu_pool *pool = pool_set->entries[order];

		if (!pool)
			continue;
		ret = rseq_percpu_pool_destroy(pool);
		if (ret)
			return ret;
		pool_set->entries[order] = NULL;
	}
	pthread_mutex_destroy(&pool_set->lock);
	free(pool_set);
	return 0;
}

/* Ownership of pool is handed over to pool set on success. */
int rseq_percpu_pool_set_add_pool(struct rseq_percpu_pool_set *pool_set, struct rseq_percpu_pool *pool)
{
	size_t item_order = pool->item_order;
	int ret = 0;

	pthread_mutex_lock(&pool_set->lock);
	if (pool_set->entries[item_order]) {
		errno = EBUSY;
		ret = -1;
		goto end;
	}
	pool_set->entries[pool->item_order] = pool;
end:
	pthread_mutex_unlock(&pool_set->lock);
	return ret;
}

static
void *__rseq_percpu_pool_set_malloc(struct rseq_percpu_pool_set *pool_set, size_t len, bool zeroed)
{
	int order, min_order = POOL_SET_MIN_ENTRY;
	struct rseq_percpu_pool *pool;
	void *addr;

again:
	pthread_mutex_lock(&pool_set->lock);
	/* First smallest present pool where @len fits. */
	for (order = min_order; order < POOL_SET_NR_ENTRIES; order++) {
		pool = pool_set->entries[order];

		if (!pool)
			continue;
		if (pool->item_len >= len)
			goto found;
	}
	pool = NULL;
found:
	pthread_mutex_unlock(&pool_set->lock);
	if (pool) {
		addr = __rseq_percpu_malloc(pool, zeroed);
		if (addr == NULL && errno == ENOMEM) {
			/*
			 * If the allocation failed, try again with a
			 * larger pool.
			 */
			min_order = order + 1;
			goto again;
		}
	} else {
		/* Not found. */
		errno = ENOMEM;
		addr = NULL;
	}
	return addr;
}

void *rseq_percpu_pool_set_malloc(struct rseq_percpu_pool_set *pool_set, size_t len)
{
	return __rseq_percpu_pool_set_malloc(pool_set, len, false);
}

void *rseq_percpu_pool_set_zmalloc(struct rseq_percpu_pool_set *pool_set, size_t len)
{
	return __rseq_percpu_pool_set_malloc(pool_set, len, true);
}
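
/*
 * Illustrative pool set usage sketch; sizes and the nr_cpus variable are
 * placeholders, and error handling is omitted:
 *
 *	struct rseq_percpu_pool_set *set = rseq_percpu_pool_set_create();
 *
 *	rseq_percpu_pool_set_add_pool(set,
 *		rseq_percpu_pool_create(16, 16 * 4096, nr_cpus,
 *			PROT_READ | PROT_WRITE,
 *			MAP_PRIVATE | MAP_ANONYMOUS, -1, 0, 0));
 *	rseq_percpu_pool_set_add_pool(set,
 *		rseq_percpu_pool_create(128, 16 * 4096, nr_cpus,
 *			PROT_READ | PROT_WRITE,
 *			MAP_PRIVATE | MAP_ANONYMOUS, -1, 0, 0));
 *
 *	// A 24-byte request is served from the 128-byte pool, since no
 *	// pool with item_len in the [32, 64] range was added.
 *	void *p = rseq_percpu_pool_set_zmalloc(set, 24);
 */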