percpu pool set malloc: start search at relevant alloc order
[librseq.git] / src / rseq-percpu-alloc.c
// SPDX-License-Identifier: MIT
// SPDX-FileCopyrightText: 2024 Mathieu Desnoyers <mathieu.desnoyers@efficios.com>

#include <rseq/percpu-alloc.h>
#include <sys/mman.h>
#include <assert.h>
#include <string.h>
#include <pthread.h>
#include <unistd.h>
#include <stdlib.h>
#include <rseq/compiler.h>
#include <errno.h>
#include <stdint.h>
#include <stdbool.h>
#include <stdio.h>

#ifdef HAVE_LIBNUMA
# include <numa.h>
# include <numaif.h>
#endif

#include "rseq-alloc-utils.h"

/*
 * rseq-percpu-alloc.c: rseq CPU-Local Storage (CLS) memory allocator.
 *
 * The rseq per-CPU memory allocator allows the application to request
 * memory pools of CPU-Local memory, each containing objects of a
 * given size (rounded up to the next power of 2), a given virtual
 * address range size per CPU, for a given maximum number of CPUs.
 *
 * The per-CPU memory allocator is analogous to TLS (Thread-Local
 * Storage) memory: TLS is Thread-Local Storage, whereas the per-CPU
 * memory allocator provides CPU-Local Storage.
 */

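/*
 * Example usage (a minimal sketch; the pool geometry, mmap arguments
 * and the cpu variable below are arbitrary illustration values, not
 * part of this file, and error handling is elided):
 *
 *	struct rseq_percpu_pool *pool;
 *	void *counters;
 *
 *	pool = rseq_percpu_pool_create(sizeof(intptr_t),
 *			16 * 4096, 128,
 *			PROT_READ | PROT_WRITE,
 *			MAP_PRIVATE | MAP_ANONYMOUS, -1, 0, 0);
 *	counters = rseq_percpu_zmalloc(pool);
 *	(*(intptr_t *) __rseq_percpu_ptr(counters, cpu))++;
 *	rseq_percpu_free(counters);
 *	rseq_percpu_pool_destroy(pool);
 */
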
/*
 * Use high bits of per-CPU addresses to index the pool.
 * This leaves the low bits available to the application for pointer
 * tagging (based on next power of 2 alignment of the allocations).
 */
#if RSEQ_BITS_PER_LONG == 64
# define POOL_INDEX_BITS	16
#else
# define POOL_INDEX_BITS	8
#endif
#define MAX_NR_POOLS		(1UL << POOL_INDEX_BITS)
#define POOL_INDEX_SHIFT	(RSEQ_BITS_PER_LONG - POOL_INDEX_BITS)
#define MAX_POOL_LEN		(1UL << POOL_INDEX_SHIFT)
#define MAX_POOL_LEN_MASK	(MAX_POOL_LEN - 1)

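/*
 * Resulting layout of an encoded per-CPU pointer on 64-bit (a sketch
 * derived from the definitions above, where POOL_INDEX_SHIFT is 48):
 *
 *	 63           48 47                              0
 *	+---------------+---------------------------------+
 *	|  pool index   |  item offset within a CPU range  |
 *	+---------------+---------------------------------+
 *
 * __rseq_percpu_ptr() turns such an encoded pointer into a usable
 * address for a given CPU:
 *
 *	pool->base + (pool->percpu_len * cpu) + item_offset
 */
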
#define POOL_SET_NR_ENTRIES	POOL_INDEX_SHIFT

/*
 * Smallest allocation should hold enough space for a free list pointer.
 */
#if RSEQ_BITS_PER_LONG == 64
# define POOL_SET_MIN_ENTRY	3	/* Smallest item_len=8 */
#else
# define POOL_SET_MIN_ENTRY	2	/* Smallest item_len=4 */
#endif

struct free_list_node;

struct free_list_node {
	struct free_list_node *next;
};

/* This lock protects pool create/destroy. */
static pthread_mutex_t pool_lock = PTHREAD_MUTEX_INITIALIZER;

struct rseq_percpu_pool {
	void *base;
	unsigned int index;
	size_t item_len;
	size_t percpu_len;
	int item_order;
	int max_nr_cpus;

	/*
	 * The free list chains freed items on the CPU 0 address range.
	 * We should rethink this decision if false sharing between
	 * malloc/free from other CPUs and data accesses from CPU 0
	 * becomes an issue. This is a NULL-terminated singly-linked
	 * list.
	 */
	struct free_list_node *free_list_head;
	size_t next_unused;
	/* This lock protects allocation/free within the pool. */
	pthread_mutex_t lock;
};

//TODO: the array of pools should grow dynamically on create.
static struct rseq_percpu_pool rseq_percpu_pool[MAX_NR_POOLS];

/*
 * Pool set entries are indexed by item_len rounded to the next power of
 * 2. A pool set can contain NULL pool entries, in which case the next
 * large enough entry will be used for allocation.
 */
struct rseq_percpu_pool_set {
	/* This lock protects add vs malloc/zmalloc within the pool set. */
	pthread_mutex_t lock;
	struct rseq_percpu_pool *entries[POOL_SET_NR_ENTRIES];
};

static
void *__rseq_pool_percpu_ptr(struct rseq_percpu_pool *pool, int cpu, uintptr_t item_offset)
{
	return pool->base + (pool->percpu_len * cpu) + item_offset;
}

void *__rseq_percpu_ptr(void *_ptr, int cpu)
{
	uintptr_t ptr = (uintptr_t) _ptr;
	uintptr_t item_offset = ptr & MAX_POOL_LEN_MASK;
	uintptr_t pool_index = ptr >> POOL_INDEX_SHIFT;
	struct rseq_percpu_pool *pool = &rseq_percpu_pool[pool_index];

	assert(cpu >= 0);
	return __rseq_pool_percpu_ptr(pool, cpu, item_offset);
}

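/*
 * Decoding example (illustrative numbers): on a 64-bit build, an
 * encoded pointer value 0x0002000000000040 splits into pool_index = 2
 * (bits 63..48) and item_offset = 0x40 (bits 47..0), so
 * __rseq_percpu_ptr(ptr, 3) returns
 * rseq_percpu_pool[2].base + 3 * percpu_len + 0x40.
 */
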
static
void rseq_percpu_zero_item(struct rseq_percpu_pool *pool, uintptr_t item_offset)
{
	int i;

	for (i = 0; i < pool->max_nr_cpus; i++) {
		char *p = __rseq_pool_percpu_ptr(pool, i, item_offset);
		memset(p, 0, pool->item_len);
	}
}

#ifdef HAVE_LIBNUMA
static
void rseq_percpu_pool_init_numa(struct rseq_percpu_pool *pool,
		int numa_flags)
{
	unsigned long nr_pages, page;
	long ret, page_len;
	int cpu;

	if (!numa_flags)
		return;
	page_len = rseq_get_page_len();
	nr_pages = pool->percpu_len >> rseq_get_count_order_ulong(page_len);
	for (cpu = 0; cpu < pool->max_nr_cpus; cpu++) {
		int node = numa_node_of_cpu(cpu);

		/* TODO: batch move_pages() call with an array of pages. */
		for (page = 0; page < nr_pages; page++) {
			void *pageptr = __rseq_pool_percpu_ptr(pool, cpu, page * page_len);
			int status = -EPERM;

			ret = move_pages(0, 1, &pageptr, &node, &status, numa_flags);
			if (ret) {
				perror("move_pages");
				abort();
			}
		}
	}
}
#else
static
void rseq_percpu_pool_init_numa(struct rseq_percpu_pool *pool __attribute__((unused)),
		int numa_flags __attribute__((unused)))
{
}
#endif

/*
 * Expected numa_flags:
 * 0: do not move pages to specific numa nodes (use for e.g. mm_cid indexing).
 * MPOL_MF_MOVE: move process-private pages to cpu-specific numa nodes.
 * MPOL_MF_MOVE_ALL: move shared pages to cpu-specific numa nodes (requires CAP_SYS_NICE).
 */
struct rseq_percpu_pool *rseq_percpu_pool_create(size_t item_len,
		size_t percpu_len, int max_nr_cpus,
		int mmap_prot, int mmap_flags, int mmap_fd,
		off_t mmap_offset, int numa_flags)
{
	struct rseq_percpu_pool *pool;
	void *base;
	unsigned int i;
	int order;

	/* Make sure each item is large enough to contain free list pointers. */
	if (item_len < sizeof(void *))
		item_len = sizeof(void *);

	/* Align item_len on next power of two. */
	order = rseq_get_count_order_ulong(item_len);
	if (order < 0) {
		errno = EINVAL;
		return NULL;
	}
	item_len = 1UL << order;

	/* Align percpu_len on page size. */
	percpu_len = rseq_align(percpu_len, rseq_get_page_len());

	if (max_nr_cpus < 0 || item_len > percpu_len ||
			percpu_len > (UINTPTR_MAX >> POOL_INDEX_BITS)) {
		errno = EINVAL;
		return NULL;
	}

	pthread_mutex_lock(&pool_lock);
	/* Linear scan in array of pools to find empty spot. */
	for (i = 0; i < MAX_NR_POOLS; i++) {
		pool = &rseq_percpu_pool[i];
		if (!pool->base)
			goto found_empty;
	}
	errno = ENOMEM;
	pool = NULL;
	goto end;

found_empty:
	base = mmap(NULL, percpu_len * max_nr_cpus, mmap_prot,
			mmap_flags, mmap_fd, mmap_offset);
	if (base == MAP_FAILED) {
		pool = NULL;
		goto end;
	}
	pthread_mutex_init(&pool->lock, NULL);
	pool->base = base;
	pool->percpu_len = percpu_len;
	pool->max_nr_cpus = max_nr_cpus;
	pool->index = i;
	pool->item_len = item_len;
	pool->item_order = order;
	/* NUMA page placement needs the pool fields populated above. */
	rseq_percpu_pool_init_numa(pool, numa_flags);
end:
	pthread_mutex_unlock(&pool_lock);
	return pool;
}

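/*
 * Example of NUMA-aware pool creation (a sketch; requires building
 * with HAVE_LIBNUMA and linking against libnuma, and the object type
 * and geometry values are arbitrary placeholders):
 *
 *	pool = rseq_percpu_pool_create(sizeof(struct foo),
 *			16 * 4096, 128,
 *			PROT_READ | PROT_WRITE,
 *			MAP_PRIVATE | MAP_ANONYMOUS, -1, 0,
 *			MPOL_MF_MOVE);
 *
 * With MPOL_MF_MOVE, each CPU's pages are moved to the NUMA node of
 * that CPU (as reported by numa_node_of_cpu()); passing 0 as
 * numa_flags skips NUMA placement entirely.
 */
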
int rseq_percpu_pool_destroy(struct rseq_percpu_pool *pool)
{
	int ret;

	pthread_mutex_lock(&pool_lock);
	if (!pool->base) {
		errno = ENOENT;
		ret = -1;
		goto end;
	}
	ret = munmap(pool->base, pool->percpu_len * pool->max_nr_cpus);
	if (ret)
		goto end;
	pthread_mutex_destroy(&pool->lock);
	memset(pool, 0, sizeof(*pool));
end:
	pthread_mutex_unlock(&pool_lock);
	return ret;
}

static
void *__rseq_percpu_malloc(struct rseq_percpu_pool *pool, bool zeroed)
{
	struct free_list_node *node;
	uintptr_t item_offset;
	void *addr;

	pthread_mutex_lock(&pool->lock);
	/* Get first entry from free list. */
	node = pool->free_list_head;
	if (node != NULL) {
		/* Remove node from free list (update head). */
		pool->free_list_head = node->next;
		item_offset = (uintptr_t) ((void *) node - pool->base);
		addr = (void *) (((uintptr_t) pool->index << POOL_INDEX_SHIFT) | item_offset);
		goto end;
	}
	if (pool->next_unused + pool->item_len > pool->percpu_len) {
		/* Pool exhausted: report ENOMEM so callers can fall back. */
		errno = ENOMEM;
		addr = NULL;
		goto end;
	}
	item_offset = pool->next_unused;
	addr = (void *) (((uintptr_t) pool->index << POOL_INDEX_SHIFT) | item_offset);
	pool->next_unused += pool->item_len;
end:
	pthread_mutex_unlock(&pool->lock);
	if (zeroed && addr)
		rseq_percpu_zero_item(pool, item_offset);
	return addr;
}

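/*
 * Allocation order illustration (assuming a fresh pool with
 * item_len = 64 and no frees yet): the first allocation returns
 * item_offset 0, the second item_offset 64, and so on, bumping
 * next_unused by item_len each time. Once items have been freed, they
 * are handed back in LIFO order from the free list before next_unused
 * is consumed again.
 */
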
void *rseq_percpu_malloc(struct rseq_percpu_pool *pool)
{
	return __rseq_percpu_malloc(pool, false);
}

void *rseq_percpu_zmalloc(struct rseq_percpu_pool *pool)
{
	return __rseq_percpu_malloc(pool, true);
}

void rseq_percpu_free(void *_ptr)
{
	uintptr_t ptr = (uintptr_t) _ptr;
	uintptr_t item_offset = ptr & MAX_POOL_LEN_MASK;
	uintptr_t pool_index = ptr >> POOL_INDEX_SHIFT;
	struct rseq_percpu_pool *pool = &rseq_percpu_pool[pool_index];
	struct free_list_node *head, *item;

	pthread_mutex_lock(&pool->lock);
	/* Add ptr to head of free list */
	head = pool->free_list_head;
	/* Free-list is in CPU 0 range. */
	item = (struct free_list_node *)__rseq_pool_percpu_ptr(pool, 0, item_offset);
	item->next = head;
	pool->free_list_head = item;
	pthread_mutex_unlock(&pool->lock);
}

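/*
 * Free path illustration: freeing an encoded per-CPU pointer does not
 * unmap anything; the item's CPU 0 slot is reinterpreted as a
 * struct free_list_node and pushed on the pool's free list. For
 * example (a sketch), after:
 *
 *	p1 = rseq_percpu_malloc(pool);
 *	p2 = rseq_percpu_malloc(pool);
 *	rseq_percpu_free(p1);
 *	rseq_percpu_free(p2);
 *
 * the free list head points at p2's CPU 0 slot, whose next pointer
 * refers to p1's CPU 0 slot, and the next two allocations return p2
 * then p1.
 */
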
struct rseq_percpu_pool_set *rseq_percpu_pool_set_create(void)
{
	struct rseq_percpu_pool_set *pool_set;

	pool_set = calloc(1, sizeof(struct rseq_percpu_pool_set));
	if (!pool_set)
		return NULL;
	pthread_mutex_init(&pool_set->lock, NULL);
	return pool_set;
}

int rseq_percpu_pool_set_destroy(struct rseq_percpu_pool_set *pool_set)
{
	int order, ret;

	for (order = POOL_SET_MIN_ENTRY; order < POOL_SET_NR_ENTRIES; order++) {
		struct rseq_percpu_pool *pool = pool_set->entries[order];

		if (!pool)
			continue;
		ret = rseq_percpu_pool_destroy(pool);
		if (ret)
			return ret;
		pool_set->entries[order] = NULL;
	}
	pthread_mutex_destroy(&pool_set->lock);
	free(pool_set);
	return 0;
}

/* Ownership of pool is handed over to pool set on success. */
int rseq_percpu_pool_set_add_pool(struct rseq_percpu_pool_set *pool_set, struct rseq_percpu_pool *pool)
{
	size_t item_order = pool->item_order;
	int ret = 0;

	pthread_mutex_lock(&pool_set->lock);
	if (pool_set->entries[item_order]) {
		errno = EBUSY;
		ret = -1;
		goto end;
	}
	pool_set->entries[pool->item_order] = pool;
end:
	pthread_mutex_unlock(&pool_set->lock);
	return ret;
}

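/*
 * Pool set usage sketch (illustration only; pool_32 and pool_128 are
 * hypothetical pools created with item_len 32 and 128, pool creation
 * arguments and error handling are elided):
 *
 *	struct rseq_percpu_pool_set *set;
 *	void *p;
 *
 *	set = rseq_percpu_pool_set_create();
 *	rseq_percpu_pool_set_add_pool(set, pool_32);
 *	rseq_percpu_pool_set_add_pool(set, pool_128);
 *	p = rseq_percpu_pool_set_zmalloc(set, 100);
 *
 * The 100-byte request is served from pool_128, the smallest pool in
 * the set whose item_len fits the requested length.
 */
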
static
void *__rseq_percpu_pool_set_malloc(struct rseq_percpu_pool_set *pool_set, size_t len, bool zeroed)
{
	int order, min_order = POOL_SET_MIN_ENTRY;
	struct rseq_percpu_pool *pool;
	void *addr;

	order = rseq_get_count_order_ulong(len);
	if (order > POOL_SET_MIN_ENTRY)
		min_order = order;
again:
	pthread_mutex_lock(&pool_set->lock);
	/* Find the smallest present pool where @len fits. */
	for (order = min_order; order < POOL_SET_NR_ENTRIES; order++) {
		pool = pool_set->entries[order];

		if (!pool)
			continue;
		if (pool->item_len >= len)
			goto found;
	}
	pool = NULL;
found:
	pthread_mutex_unlock(&pool_set->lock);
	if (pool) {
		addr = __rseq_percpu_malloc(pool, zeroed);
		if (addr == NULL && errno == ENOMEM) {
			/*
			 * If the allocation failed, try again with a
			 * larger pool.
			 */
			min_order = order + 1;
			goto again;
		}
	} else {
		/* Not found. */
		errno = ENOMEM;
		addr = NULL;
	}
	return addr;
}

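/*
 * Order selection example (illustrative): for len = 100,
 * rseq_get_count_order_ulong(100) returns 7, so the search starts at
 * entries[7] (item_len 128) rather than at POOL_SET_MIN_ENTRY. If the
 * matching pool is exhausted (ENOMEM), the search is retried from the
 * next larger order.
 */
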
void *rseq_percpu_pool_set_malloc(struct rseq_percpu_pool_set *pool_set, size_t len)
{
	return __rseq_percpu_pool_set_malloc(pool_set, len, false);
}

void *rseq_percpu_pool_set_zmalloc(struct rseq_percpu_pool_set *pool_set, size_t len)
{
	return __rseq_percpu_pool_set_malloc(pool_set, len, true);
}