percpu allocator: Add flags argument for future extensions
[librseq.git] / src / rseq-percpu-alloc.c
// SPDX-License-Identifier: MIT
// SPDX-FileCopyrightText: 2024 Mathieu Desnoyers <mathieu.desnoyers@efficios.com>

#include <rseq/percpu-alloc.h>
#include <sys/mman.h>
#include <assert.h>
#include <string.h>
#include <pthread.h>
#include <unistd.h>
#include <stdlib.h>
#include <rseq/compiler.h>
#include <errno.h>
#include <stdint.h>
#include <stdbool.h>
#include <stdio.h>

#ifdef HAVE_LIBNUMA
# include <numa.h>
# include <numaif.h>
#endif

#include "rseq-alloc-utils.h"

/*
 * rseq-percpu-alloc.c: rseq CPU-Local Storage (CLS) memory allocator.
 *
 * The rseq per-CPU memory allocator allows the application to request
 * memory pools of CPU-Local memory, each containing objects of a
 * given size (rounded up to the next power of 2), reserving a given
 * virtual address size per CPU, for a given maximum number of CPUs.
 *
 * The per-CPU memory allocator is analogous to TLS (Thread-Local
 * Storage) memory: TLS is Thread-Local Storage, whereas the per-CPU
 * memory allocator provides CPU-Local Storage.
 */

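/*
 * Example (sketch, not compiled as part of this file): typical use of
 * the allocator implemented below. rseq_percpu_ptr() is assumed to be
 * the typed accessor provided by <rseq/percpu-alloc.h> on top of
 * __rseq_percpu_ptr(); the pool length (1 MiB of virtual address space
 * reserved per CPU) and the CPU count are arbitrary. The trailing
 * flags argument of rseq_percpu_pool_create() is reserved for future
 * extensions and must currently be 0 (non-zero values fail with
 * EINVAL).
 *
 *	struct my_counter { intptr_t count; };
 *
 *	struct rseq_percpu_pool *pool;
 *	struct my_counter __rseq_percpu *counter;
 *	intptr_t sum = 0;
 *	int cpu, max_nr_cpus = 128;
 *
 *	pool = rseq_percpu_pool_create(sizeof(struct my_counter),
 *			1UL << 20, max_nr_cpus, NULL, 0);
 *	if (!pool)
 *		abort();
 *	counter = (struct my_counter __rseq_percpu *) rseq_percpu_zmalloc(pool);
 *	if (!counter)
 *		abort();
 *	for (cpu = 0; cpu < max_nr_cpus; cpu++)
 *		sum += rseq_percpu_ptr(counter, cpu)->count;
 *	rseq_percpu_free(counter);
 *	(void) rseq_percpu_pool_destroy(pool);
 */
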
/*
 * Use high bits of per-CPU addresses to index the pool.
 * This leaves the low bits available to the application for pointer
 * tagging (based on next power of 2 alignment of the allocations).
 */
#if RSEQ_BITS_PER_LONG == 64
# define POOL_INDEX_BITS	16
#else
# define POOL_INDEX_BITS	8
#endif
#define MAX_NR_POOLS		(1UL << POOL_INDEX_BITS)
#define POOL_INDEX_SHIFT	(RSEQ_BITS_PER_LONG - POOL_INDEX_BITS)
#define MAX_POOL_LEN		(1UL << POOL_INDEX_SHIFT)
#define MAX_POOL_LEN_MASK	(MAX_POOL_LEN - 1)

#define POOL_SET_NR_ENTRIES	POOL_INDEX_SHIFT

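/*
 * Illustration (sketch): the helpers below are hypothetical, not part
 * of the librseq API. Because each allocation is aligned on its
 * power-of-2 item_len, an application can stash a small tag in the low
 * bits of an encoded __rseq_percpu pointer, provided the tag is
 * cleared before the pointer is dereferenced or freed.
 *
 *	static inline uintptr_t tag_mask(size_t item_len)
 *	{
 *		return (uintptr_t) item_len - 1;
 *	}
 *
 *	static inline void __rseq_percpu *ptr_tag(void __rseq_percpu *p,
 *			uintptr_t tag, size_t item_len)
 *	{
 *		return (void __rseq_percpu *) ((uintptr_t) p | (tag & tag_mask(item_len)));
 *	}
 *
 *	static inline void __rseq_percpu *ptr_untag(void __rseq_percpu *p,
 *			size_t item_len)
 *	{
 *		return (void __rseq_percpu *) ((uintptr_t) p & ~tag_mask(item_len));
 *	}
 */
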
/*
 * Smallest allocation should hold enough space for a free list pointer.
 */
#if RSEQ_BITS_PER_LONG == 64
# define POOL_SET_MIN_ENTRY	3	/* Smallest item_len=8 */
#else
# define POOL_SET_MIN_ENTRY	2	/* Smallest item_len=4 */
#endif

/*
 * Skip pool index 0 to ensure allocated entries at index 0 do not match
 * a NULL pointer.
 */
#define FIRST_POOL		1

struct free_list_node;

struct free_list_node {
	struct free_list_node *next;
};

/* This lock protects pool create/destroy. */
static pthread_mutex_t pool_lock = PTHREAD_MUTEX_INITIALIZER;

struct rseq_mmap_attr {
	void *(*mmap_func)(void *priv, size_t len);
	int (*munmap_func)(void *priv, void *ptr, size_t len);
	void *mmap_priv;
};

struct rseq_percpu_pool {
	void *base;
	unsigned int index;
	size_t item_len;
	size_t percpu_len;
	int item_order;
	int max_nr_cpus;

	/*
	 * The free list chains freed items on the CPU 0 address range.
	 * We should rethink this decision if false sharing between
	 * malloc/free from other CPUs and data accesses from CPU 0
	 * becomes an issue. This is a NULL-terminated singly-linked
	 * list.
	 */
	struct free_list_node *free_list_head;
	size_t next_unused;
	/* This lock protects allocation/free within the pool. */
	pthread_mutex_t lock;

	struct rseq_mmap_attr mmap_attr;
};

//TODO: the array of pools should grow dynamically on create.
static struct rseq_percpu_pool rseq_percpu_pool[MAX_NR_POOLS];

/*
 * Pool set entries are indexed by item_len rounded to the next power of
 * 2. A pool set can contain NULL pool entries, in which case the next
 * large enough entry will be used for allocation.
 */
struct rseq_percpu_pool_set {
	/* This lock protects add vs malloc/zmalloc within the pool set. */
	pthread_mutex_t lock;
	struct rseq_percpu_pool *entries[POOL_SET_NR_ENTRIES];
};

static
void *__rseq_pool_percpu_ptr(struct rseq_percpu_pool *pool, int cpu, uintptr_t item_offset)
{
	return pool->base + (pool->percpu_len * cpu) + item_offset;
}

ptrdiff_t rseq_percpu_pool_ptr_offset(struct rseq_percpu_pool *pool, int cpu)
{
	uintptr_t rseq_percpu_base = (uintptr_t) pool->index << POOL_INDEX_SHIFT;
	uintptr_t refptr = (uintptr_t) __rseq_pool_percpu_ptr(pool, cpu, 0);

	return (ptrdiff_t) (refptr - rseq_percpu_base);
}

/*
 * Decode an encoded __rseq_percpu pointer: the high bits select the
 * pool, the low bits are the item offset within the selected CPU's
 * address range.
 */
void *__rseq_percpu_ptr(void __rseq_percpu *_ptr, int cpu)
{
	uintptr_t ptr = (uintptr_t) _ptr;
	uintptr_t item_offset = ptr & MAX_POOL_LEN_MASK;
	uintptr_t pool_index = ptr >> POOL_INDEX_SHIFT;
	struct rseq_percpu_pool *pool = &rseq_percpu_pool[pool_index];

	assert(cpu >= 0);
	return __rseq_pool_percpu_ptr(pool, cpu, item_offset);
}

/* Zero the item at @item_offset in every CPU's copy of the pool. */
static
void rseq_percpu_zero_item(struct rseq_percpu_pool *pool, uintptr_t item_offset)
{
	int i;

	for (i = 0; i < pool->max_nr_cpus; i++) {
		char *p = __rseq_pool_percpu_ptr(pool, i, item_offset);
		memset(p, 0, pool->item_len);
	}
}

#ifdef HAVE_LIBNUMA
int rseq_percpu_pool_init_numa(struct rseq_percpu_pool *pool, int numa_flags)
{
	unsigned long nr_pages, page;
	long ret, page_len;
	int cpu;

	if (!numa_flags)
		return 0;
	page_len = rseq_get_page_len();
	nr_pages = pool->percpu_len >> rseq_get_count_order_ulong(page_len);
	for (cpu = 0; cpu < pool->max_nr_cpus; cpu++) {
		int node = numa_node_of_cpu(cpu);

		/* TODO: batch move_pages() call with an array of pages. */
		for (page = 0; page < nr_pages; page++) {
			void *pageptr = __rseq_pool_percpu_ptr(pool, cpu, page * page_len);
			int status = -EPERM;

			ret = move_pages(0, 1, &pageptr, &node, &status, numa_flags);
			if (ret)
				return ret;
		}
	}
	return 0;
}
#else
int rseq_percpu_pool_init_numa(struct rseq_percpu_pool *pool __attribute__((unused)),
		int numa_flags __attribute__((unused)))
{
	return 0;
}
#endif
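
/*
 * Example (sketch): after a pool is created, its pages can be migrated
 * so that each CPU's range resides on that CPU's NUMA node. The
 * numa_flags value is forwarded to move_pages(2); MPOL_MF_MOVE from
 * <numaif.h> is the usual choice (MPOL_MF_MOVE_ALL requires
 * CAP_SYS_NICE).
 *
 *	int ret = rseq_percpu_pool_init_numa(pool, MPOL_MF_MOVE);
 *
 *	if (ret)
 *		fprintf(stderr, "NUMA page migration failed\n");
 */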

static
void *default_mmap_func(void *priv __attribute__((unused)), size_t len)
{
	void *base;

	base = mmap(NULL, len, PROT_READ | PROT_WRITE,
			MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
	if (base == MAP_FAILED)
		return NULL;
	return base;
}

static
int default_munmap_func(void *priv __attribute__((unused)), void *ptr, size_t len)
{
	return munmap(ptr, len);
}

struct rseq_percpu_pool *rseq_percpu_pool_create(size_t item_len,
		size_t percpu_len, int max_nr_cpus,
		const struct rseq_mmap_attr *mmap_attr,
		int flags)
{
	void *(*mmap_func)(void *priv, size_t len);
	int (*munmap_func)(void *priv, void *ptr, size_t len);
	void *mmap_priv;
	struct rseq_percpu_pool *pool;
	void *base;
	unsigned int i;
	int order;

	/* The flags argument is reserved for future extensions. */
	if (flags) {
		errno = EINVAL;
		return NULL;
	}

	/* Make sure each item is large enough to contain free list pointers. */
	if (item_len < sizeof(void *))
		item_len = sizeof(void *);

	/* Align item_len on next power of two. */
	order = rseq_get_count_order_ulong(item_len);
	if (order < 0) {
		errno = EINVAL;
		return NULL;
	}
	item_len = 1UL << order;

	/* Align percpu_len on page size. */
	percpu_len = rseq_align(percpu_len, rseq_get_page_len());

	if (max_nr_cpus < 0 || item_len > percpu_len ||
			percpu_len > (UINTPTR_MAX >> POOL_INDEX_BITS)) {
		errno = EINVAL;
		return NULL;
	}

	if (mmap_attr) {
		mmap_func = mmap_attr->mmap_func;
		munmap_func = mmap_attr->munmap_func;
		mmap_priv = mmap_attr->mmap_priv;
	} else {
		mmap_func = default_mmap_func;
		munmap_func = default_munmap_func;
		mmap_priv = NULL;
	}
	pthread_mutex_lock(&pool_lock);
	/* Linear scan in array of pools to find empty spot. */
	for (i = FIRST_POOL; i < MAX_NR_POOLS; i++) {
		pool = &rseq_percpu_pool[i];
		if (!pool->base)
			goto found_empty;
	}
	errno = ENOMEM;
	pool = NULL;
	goto end;

found_empty:
	base = mmap_func(mmap_priv, percpu_len * max_nr_cpus);
	if (!base) {
		pool = NULL;
		goto end;
	}
	pthread_mutex_init(&pool->lock, NULL);
	pool->base = base;
	pool->percpu_len = percpu_len;
	pool->max_nr_cpus = max_nr_cpus;
	pool->index = i;
	pool->item_len = item_len;
	pool->item_order = order;
	pool->mmap_attr.mmap_func = mmap_func;
	pool->mmap_attr.munmap_func = munmap_func;
	pool->mmap_attr.mmap_priv = mmap_priv;
end:
	pthread_mutex_unlock(&pool_lock);
	return pool;
}

int rseq_percpu_pool_destroy(struct rseq_percpu_pool *pool)
{
	int ret;

	pthread_mutex_lock(&pool_lock);
	if (!pool->base) {
		errno = ENOENT;
		ret = -1;
		goto end;
	}
	ret = pool->mmap_attr.munmap_func(pool->mmap_attr.mmap_priv, pool->base,
			pool->percpu_len * pool->max_nr_cpus);
	if (ret)
		goto end;
	pthread_mutex_destroy(&pool->lock);
	memset(pool, 0, sizeof(*pool));
end:
	pthread_mutex_unlock(&pool_lock);
	return ret;
}

static
void __rseq_percpu *__rseq_percpu_malloc(struct rseq_percpu_pool *pool, bool zeroed)
{
	struct free_list_node *node;
	uintptr_t item_offset;
	void __rseq_percpu *addr;

	pthread_mutex_lock(&pool->lock);
	/* Get first entry from free list. */
	node = pool->free_list_head;
	if (node != NULL) {
		/* Remove node from free list (update head). */
		pool->free_list_head = node->next;
		item_offset = (uintptr_t) ((void *) node - pool->base);
		addr = (void *) (((uintptr_t) pool->index << POOL_INDEX_SHIFT) | item_offset);
		goto end;
	}
	if (pool->next_unused + pool->item_len > pool->percpu_len) {
		errno = ENOMEM;
		addr = NULL;
		goto end;
	}
	item_offset = pool->next_unused;
	addr = (void *) (((uintptr_t) pool->index << POOL_INDEX_SHIFT) | item_offset);
	pool->next_unused += pool->item_len;
end:
	pthread_mutex_unlock(&pool->lock);
	if (zeroed && addr)
		rseq_percpu_zero_item(pool, item_offset);
	return addr;
}

void __rseq_percpu *rseq_percpu_malloc(struct rseq_percpu_pool *pool)
{
	return __rseq_percpu_malloc(pool, false);
}

void __rseq_percpu *rseq_percpu_zmalloc(struct rseq_percpu_pool *pool)
{
	return __rseq_percpu_malloc(pool, true);
}

void rseq_percpu_free(void __rseq_percpu *_ptr)
{
	uintptr_t ptr = (uintptr_t) _ptr;
	uintptr_t item_offset = ptr & MAX_POOL_LEN_MASK;
	uintptr_t pool_index = ptr >> POOL_INDEX_SHIFT;
	struct rseq_percpu_pool *pool = &rseq_percpu_pool[pool_index];
	struct free_list_node *head, *item;

	pthread_mutex_lock(&pool->lock);
	/* Add ptr to head of free list */
	head = pool->free_list_head;
	/* Free-list is in CPU 0 range. */
	item = (struct free_list_node *)__rseq_pool_percpu_ptr(pool, 0, item_offset);
	item->next = head;
	pool->free_list_head = item;
	pthread_mutex_unlock(&pool->lock);
}

struct rseq_percpu_pool_set *rseq_percpu_pool_set_create(void)
{
	struct rseq_percpu_pool_set *pool_set;

	pool_set = calloc(1, sizeof(struct rseq_percpu_pool_set));
	if (!pool_set)
		return NULL;
	pthread_mutex_init(&pool_set->lock, NULL);
	return pool_set;
}

int rseq_percpu_pool_set_destroy(struct rseq_percpu_pool_set *pool_set)
{
	int order, ret;

	for (order = POOL_SET_MIN_ENTRY; order < POOL_SET_NR_ENTRIES; order++) {
		struct rseq_percpu_pool *pool = pool_set->entries[order];

		if (!pool)
			continue;
		ret = rseq_percpu_pool_destroy(pool);
		if (ret)
			return ret;
		pool_set->entries[order] = NULL;
	}
	pthread_mutex_destroy(&pool_set->lock);
	free(pool_set);
	return 0;
}

/* Ownership of pool is handed over to pool set on success. */
int rseq_percpu_pool_set_add_pool(struct rseq_percpu_pool_set *pool_set, struct rseq_percpu_pool *pool)
{
	size_t item_order = pool->item_order;
	int ret = 0;

	pthread_mutex_lock(&pool_set->lock);
	if (pool_set->entries[item_order]) {
		errno = EBUSY;
		ret = -1;
		goto end;
	}
	pool_set->entries[pool->item_order] = pool;
end:
	pthread_mutex_unlock(&pool_set->lock);
	return ret;
}

static
void __rseq_percpu *__rseq_percpu_pool_set_malloc(struct rseq_percpu_pool_set *pool_set, size_t len, bool zeroed)
{
	int order, min_order = POOL_SET_MIN_ENTRY;
	struct rseq_percpu_pool *pool;
	void __rseq_percpu *addr;

	order = rseq_get_count_order_ulong(len);
	if (order > POOL_SET_MIN_ENTRY)
		min_order = order;
again:
	pthread_mutex_lock(&pool_set->lock);
	/* First smallest present pool where @len fits. */
	for (order = min_order; order < POOL_SET_NR_ENTRIES; order++) {
		pool = pool_set->entries[order];

		if (!pool)
			continue;
		if (pool->item_len >= len)
			goto found;
	}
	pool = NULL;
found:
	pthread_mutex_unlock(&pool_set->lock);
	if (pool) {
		addr = __rseq_percpu_malloc(pool, zeroed);
		if (addr == NULL && errno == ENOMEM) {
			/*
			 * If the allocation failed, try again with a
			 * larger pool.
			 */
			min_order = order + 1;
			goto again;
		}
	} else {
		/* Not found. */
		errno = ENOMEM;
		addr = NULL;
	}
	return addr;
}

void __rseq_percpu *rseq_percpu_pool_set_malloc(struct rseq_percpu_pool_set *pool_set, size_t len)
{
	return __rseq_percpu_pool_set_malloc(pool_set, len, false);
}

void __rseq_percpu *rseq_percpu_pool_set_zmalloc(struct rseq_percpu_pool_set *pool_set, size_t len)
{
	return __rseq_percpu_pool_set_malloc(pool_set, len, true);
}
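
/*
 * Example (sketch): a pool set combines pools of different item sizes
 * behind a single malloc-style interface. The pool lengths and CPU
 * count below are arbitrary.
 *
 *	struct rseq_percpu_pool_set *set;
 *	struct rseq_percpu_pool *pool;
 *	void __rseq_percpu *p;
 *	size_t item_len;
 *
 *	set = rseq_percpu_pool_set_create();
 *	if (!set)
 *		abort();
 *	for (item_len = 8; item_len <= 256; item_len <<= 1) {
 *		pool = rseq_percpu_pool_create(item_len, 1UL << 20, 128, NULL, 0);
 *		if (!pool || rseq_percpu_pool_set_add_pool(set, pool))
 *			abort();
 *	}
 *	p = rseq_percpu_pool_set_zmalloc(set, 100);
 *	if (!p)
 *		abort();
 *	rseq_percpu_free(p);
 *	(void) rseq_percpu_pool_set_destroy(set);
 *
 * The 100-byte request above is served by the 128-byte pool: the set
 * lookup starts at the entry indexed by the next power of 2 of the
 * requested length.
 */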

struct rseq_mmap_attr *rseq_mmap_attr_create(void *(*mmap_func)(void *priv, size_t len),
		int (*munmap_func)(void *priv, void *ptr, size_t len),
		void *mmap_priv)
{
	struct rseq_mmap_attr *attr = calloc(1, sizeof(struct rseq_mmap_attr));

	if (!attr)
		return NULL;
	attr->mmap_func = mmap_func;
	attr->munmap_func = munmap_func;
	attr->mmap_priv = mmap_priv;
	return attr;
}

void rseq_mmap_attr_destroy(struct rseq_mmap_attr *attr)
{
	free(attr);
}
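
/*
 * Example (sketch): the callbacks below are hypothetical and only
 * illustrate the rseq_mmap_attr extension point. They back the pool
 * with memory the application manages itself (plain mmap here, but a
 * hugepage or file-backed mapping would follow the same pattern). The
 * item_len, percpu_len and max_nr_cpus names stand for the
 * application's own values. The attribute object can be destroyed
 * right after pool creation because rseq_percpu_pool_create() copies
 * its content into the pool.
 *
 *	static void *my_mmap(void *priv __attribute__((unused)), size_t len)
 *	{
 *		void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
 *				MAP_ANONYMOUS | MAP_PRIVATE | MAP_POPULATE, -1, 0);
 *
 *		return p == MAP_FAILED ? NULL : p;
 *	}
 *
 *	static int my_munmap(void *priv __attribute__((unused)), void *ptr, size_t len)
 *	{
 *		return munmap(ptr, len);
 *	}
 *
 *	struct rseq_mmap_attr *attr;
 *	struct rseq_percpu_pool *pool;
 *
 *	attr = rseq_mmap_attr_create(my_mmap, my_munmap, NULL);
 *	if (!attr)
 *		abort();
 *	pool = rseq_percpu_pool_create(item_len, percpu_len, max_nr_cpus, attr, 0);
 *	rseq_mmap_attr_destroy(attr);
 */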