Implement per-cpu memory allocator
src/rseq-percpu-alloc.c
// SPDX-License-Identifier: MIT
// SPDX-FileCopyrightText: 2024 Mathieu Desnoyers <mathieu.desnoyers@efficios.com>

#include <rseq/percpu-alloc.h>
#include <sys/mman.h>
#include <assert.h>
#include <string.h>
#include <pthread.h>
#include <unistd.h>
#include <stdlib.h>
#include <rseq/compiler.h>
#include <errno.h>
#include <stdint.h>
#include <stdbool.h>

/*
 * rseq-percpu-alloc.c: rseq per-cpu memory allocator.
 *
 * The rseq per-cpu memory allocator allows the application to request
 * memory pools of per-cpu memory, where each pool reserves a given
 * virtual address range per CPU, for a given maximum number of CPUs.
 */
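
/*
 * Illustrative usage sketch (not part of the allocator; error handling
 * omitted). The structure name and pool parameters below are
 * hypothetical; the public declarations live in <rseq/percpu-alloc.h>.
 *
 *      struct my_counter { intptr_t count; };
 *
 *      struct rseq_percpu_pool *pool;
 *      struct my_counter *c;
 *
 *      // One pool: up to 128 CPUs, 16 pages of items per CPU.
 *      pool = rseq_percpu_pool_create(sizeof(struct my_counter),
 *                      16 * 4096, 128, PROT_READ | PROT_WRITE,
 *                      MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *      c = rseq_percpu_zmalloc(pool);
 *      // Dereference the CPU 3 copy of the allocated item.
 *      ((struct my_counter *) __rseq_percpu_ptr(c, 3))->count++;
 *      rseq_percpu_free(c);
 *      rseq_percpu_pool_destroy(pool);
 */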

/* Use lowest bits of per-cpu addresses to index the pool. */
#if RSEQ_BITS_PER_LONG == 64
# define OFFSET_SHIFT   16
#else
# define OFFSET_SHIFT   8
#endif
#define MAX_NR_POOLS    (1U << OFFSET_SHIFT)
#define POOL_MASK       (MAX_NR_POOLS - 1)
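
/*
 * Worked example (64-bit, illustrative numbers): an item at offset 0x40
 * within the pool at index 2 is handed out as the encoded pointer
 * (0x40 << OFFSET_SHIFT) | 2 == 0x400002. __rseq_percpu_ptr() later
 * splits that value back into item offset and pool index, then adds
 * cpu * percpu_len to reach a given CPU's copy of the item.
 */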

#define POOL_SET_NR_ENTRIES     (RSEQ_BITS_PER_LONG - OFFSET_SHIFT)
#define POOL_SET_MAX_ALLOC_LEN  (1U << POOL_SET_NR_ENTRIES)

#if RSEQ_BITS_PER_LONG == 64
# define POOL_SET_MIN_ENTRY     3       /* Smallest item_len=8 */
#else
# define POOL_SET_MIN_ENTRY     2       /* Smallest item_len=4 */
#endif

#define DEFAULT_PAGE_SIZE       4096

struct free_list_node;

struct free_list_node {
        struct free_list_node *next;
};

/* This lock protects pool create/destroy. */
static pthread_mutex_t pool_lock = PTHREAD_MUTEX_INITIALIZER;

struct rseq_percpu_pool {
        void *base;
        unsigned int index;
        size_t item_len;
        size_t percpu_len;
        int item_order;
        int max_nr_cpus;

        /*
         * The free list chains freed items on the cpu 0 address range.
         * We should rethink this decision if false sharing between
         * malloc/free from other cpus and data accesses from cpu 0
         * becomes an issue. This is a NULL-terminated singly-linked
         * list.
         */
        struct free_list_node *free_list_head;
        size_t next_unused;
        /* This lock protects allocation/free within the pool. */
        pthread_mutex_t lock;
};

// TODO: the array of pools should grow dynamically on create.
static struct rseq_percpu_pool rseq_percpu_pool[MAX_NR_POOLS];

/*
 * Pool set entries are indexed by item_len rounded to the next power of
 * 2. A pool set can contain NULL pool entries, in which case the next
 * large enough entry will be used for allocation.
 */
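/*
 * For instance (illustrative numbers): a pool created with item_len 24
 * rounds up to 32-byte items and is registered at entries[5]; a later
 * request for 24 bytes is served from entries[5] when populated,
 * otherwise from the next populated larger entry.
 */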
struct rseq_percpu_pool_set {
        /* This lock protects add vs malloc/zmalloc within the pool set. */
        pthread_mutex_t lock;
        struct rseq_percpu_pool *entries[POOL_SET_NR_ENTRIES];
};

#define __rseq_align_mask(v, mask)      (((v) + (mask)) & ~(mask))
#define rseq_align(v, align)            __rseq_align_mask(v, (__typeof__(v)) (align) - 1)
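
/*
 * Example (illustrative values, power-of-two alignment only):
 * rseq_align(5000, 4096) yields 8192, rseq_align(4096, 4096) yields 4096.
 */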

static __attribute__((unused))
unsigned int fls_u64(uint64_t x)
{
        unsigned int r = 64;

        if (!x)
                return 0;

        if (!(x & 0xFFFFFFFF00000000ULL)) {
                x <<= 32;
                r -= 32;
        }
        if (!(x & 0xFFFF000000000000ULL)) {
                x <<= 16;
                r -= 16;
        }
        if (!(x & 0xFF00000000000000ULL)) {
                x <<= 8;
                r -= 8;
        }
        if (!(x & 0xF000000000000000ULL)) {
                x <<= 4;
                r -= 4;
        }
        if (!(x & 0xC000000000000000ULL)) {
                x <<= 2;
                r -= 2;
        }
        if (!(x & 0x8000000000000000ULL)) {
                x <<= 1;
                r -= 1;
        }
        return r;
}

static __attribute__((unused))
unsigned int fls_u32(uint32_t x)
{
        unsigned int r = 32;

        if (!x)
                return 0;
        if (!(x & 0xFFFF0000U)) {
                x <<= 16;
                r -= 16;
        }
        if (!(x & 0xFF000000U)) {
                x <<= 8;
                r -= 8;
        }
        if (!(x & 0xF0000000U)) {
                x <<= 4;
                r -= 4;
        }
        if (!(x & 0xC0000000U)) {
                x <<= 2;
                r -= 2;
        }
        if (!(x & 0x80000000U)) {
                x <<= 1;
                r -= 1;
        }
        return r;
}

static
unsigned int fls_ulong(unsigned long x)
{
#if RSEQ_BITS_PER_LONG == 32
        return fls_u32(x);
#else
        return fls_u64(x);
#endif
}

/*
 * Return the minimum order for which x <= (1UL << order).
 * Return -1 if x is 0.
 */
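/*
 * For example (illustrative values): get_count_order_ulong(1) == 0,
 * get_count_order_ulong(24) == 5 (24 <= 1UL << 5), and
 * get_count_order_ulong(32) == 5.
 */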
static
int get_count_order_ulong(unsigned long x)
{
        if (!x)
                return -1;

        return fls_ulong(x - 1);
}

struct rseq_percpu_pool *rseq_percpu_pool_create(size_t item_len,
                size_t percpu_len, int max_nr_cpus,
                int prot, int flags, int fd, off_t offset)
{
        struct rseq_percpu_pool *pool;
        void *base;
        unsigned int i;
        int order;
        long page_len;

        /* Make sure each item is large enough to contain free list pointers. */
        if (item_len < sizeof(void *))
                item_len = sizeof(void *);

        /* Align item_len on next power of two. */
        order = get_count_order_ulong(item_len);
        if (order < 0) {
                errno = EINVAL;
                return NULL;
        }
        item_len = 1UL << order;

        /* Align percpu_len on page size. */
        page_len = sysconf(_SC_PAGE_SIZE);
        if (page_len < 0)
                page_len = DEFAULT_PAGE_SIZE;
        percpu_len = rseq_align(percpu_len, page_len);

        if (max_nr_cpus < 0 || item_len > percpu_len ||
                        percpu_len > (UINTPTR_MAX >> OFFSET_SHIFT)) {
                errno = EINVAL;
                return NULL;
        }

        pthread_mutex_lock(&pool_lock);
        /* Linear scan in array of pools to find empty spot. */
        for (i = 0; i < MAX_NR_POOLS; i++) {
                pool = &rseq_percpu_pool[i];
                if (!pool->base)
                        goto found_empty;
        }
        errno = ENOMEM;
        pool = NULL;
        goto end;

found_empty:
        base = mmap(NULL, percpu_len * max_nr_cpus, prot, flags, fd, offset);
        if (base == MAP_FAILED) {
                pool = NULL;
                goto end;
        }
        // TODO: integrate with libnuma to provide NUMA placement hints.
        // See move_pages(2).
        pthread_mutex_init(&pool->lock, NULL);
        pool->base = base;
        pool->percpu_len = percpu_len;
        pool->max_nr_cpus = max_nr_cpus;
        pool->index = i;
        pool->item_len = item_len;
        pool->item_order = order;
end:
        pthread_mutex_unlock(&pool_lock);
        return pool;
}

int rseq_percpu_pool_destroy(struct rseq_percpu_pool *pool)
{
        int ret;

        pthread_mutex_lock(&pool_lock);
        if (!pool->base) {
                errno = ENOENT;
                ret = -1;
                goto end;
        }
        ret = munmap(pool->base, pool->percpu_len * pool->max_nr_cpus);
        if (ret)
                goto end;
        pthread_mutex_destroy(&pool->lock);
        memset(pool, 0, sizeof(*pool));
end:
        pthread_mutex_unlock(&pool_lock);
        /* Propagate errors from the ENOENT and munmap failure paths. */
        return ret;
}

static
void *__rseq_pool_percpu_ptr(struct rseq_percpu_pool *pool, int cpu, uintptr_t item_offset)
{
        return pool->base + (pool->percpu_len * cpu) + item_offset;
}

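/*
 * Translate an encoded per-cpu pointer (item offset in the upper bits,
 * pool index in the OFFSET_SHIFT lowest bits) into the address of the
 * item within the given CPU's range of its pool.
 */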
void *__rseq_percpu_ptr(void *_ptr, int cpu)
{
        uintptr_t ptr = (uintptr_t) _ptr;
        uintptr_t item_offset = ptr >> OFFSET_SHIFT;
        uintptr_t pool_index = ptr & POOL_MASK;
        struct rseq_percpu_pool *pool = &rseq_percpu_pool[pool_index];

        assert(cpu >= 0);
        return __rseq_pool_percpu_ptr(pool, cpu, item_offset);
}

static
void rseq_percpu_zero_item(struct rseq_percpu_pool *pool, uintptr_t item_offset)
{
        int i;

        for (i = 0; i < pool->max_nr_cpus; i++) {
                char *p = __rseq_pool_percpu_ptr(pool, i, item_offset);
                memset(p, 0, pool->item_len);
        }
}

static
void *__rseq_percpu_malloc(struct rseq_percpu_pool *pool, bool zeroed)
{
        struct free_list_node *node;
        uintptr_t item_offset;
        void *addr;

        pthread_mutex_lock(&pool->lock);
        /* Get first entry from free list. */
        node = pool->free_list_head;
        if (node != NULL) {
                /* Remove node from free list (update head). */
                pool->free_list_head = node->next;
                item_offset = (uintptr_t) ((void *) node - pool->base);
                addr = (void *) ((item_offset << OFFSET_SHIFT) | pool->index);
                goto end;
        }
        if (pool->next_unused + pool->item_len > pool->percpu_len) {
                /* Pool exhausted: let pool set callers fall back to a larger pool. */
                errno = ENOMEM;
                addr = NULL;
                goto end;
        }
        item_offset = pool->next_unused;
        addr = (void *) ((item_offset << OFFSET_SHIFT) | pool->index);
        pool->next_unused += pool->item_len;
end:
        pthread_mutex_unlock(&pool->lock);
        if (zeroed && addr)
                rseq_percpu_zero_item(pool, item_offset);
        return addr;
}

void *rseq_percpu_malloc(struct rseq_percpu_pool *pool)
{
        return __rseq_percpu_malloc(pool, false);
}

void *rseq_percpu_zmalloc(struct rseq_percpu_pool *pool)
{
        return __rseq_percpu_malloc(pool, true);
}

void rseq_percpu_free(void *_ptr)
{
        uintptr_t ptr = (uintptr_t) _ptr;
        uintptr_t item_offset = ptr >> OFFSET_SHIFT;
        uintptr_t pool_index = ptr & POOL_MASK;
        struct rseq_percpu_pool *pool = &rseq_percpu_pool[pool_index];
        struct free_list_node *head, *item;

        pthread_mutex_lock(&pool->lock);
        /* Add ptr to head of free list. */
        head = pool->free_list_head;
        /* Free-list is in cpu 0 range. */
        item = (struct free_list_node *) __rseq_pool_percpu_ptr(pool, 0, item_offset);
        item->next = head;
        pool->free_list_head = item;
        pthread_mutex_unlock(&pool->lock);
}

struct rseq_percpu_pool_set *rseq_percpu_pool_set_create(void)
{
        struct rseq_percpu_pool_set *pool_set;

        pool_set = calloc(1, sizeof(struct rseq_percpu_pool_set));
        if (!pool_set)
                return NULL;
        pthread_mutex_init(&pool_set->lock, NULL);
        return pool_set;
}

int rseq_percpu_pool_set_destroy(struct rseq_percpu_pool_set *pool_set)
{
        int order, ret;

        for (order = POOL_SET_MIN_ENTRY; order < POOL_SET_NR_ENTRIES; order++) {
                struct rseq_percpu_pool *pool = pool_set->entries[order];

                if (!pool)
                        continue;
                ret = rseq_percpu_pool_destroy(pool);
                if (ret)
                        return ret;
                pool_set->entries[order] = NULL;
        }
        pthread_mutex_destroy(&pool_set->lock);
        free(pool_set);
        return 0;
}

/* Ownership of pool is handed over to pool set on success. */
int rseq_percpu_pool_set_add_pool(struct rseq_percpu_pool_set *pool_set, struct rseq_percpu_pool *pool)
{
        size_t item_order = pool->item_order;
        int ret = 0;

        pthread_mutex_lock(&pool_set->lock);
        if (pool_set->entries[item_order]) {
                errno = EBUSY;
                ret = -1;
                goto end;
        }
        pool_set->entries[item_order] = pool;
end:
        pthread_mutex_unlock(&pool_set->lock);
        return ret;
}

static
void *__rseq_percpu_pool_set_malloc(struct rseq_percpu_pool_set *pool_set, size_t len, bool zeroed)
{
        int order, min_order = POOL_SET_MIN_ENTRY;
        struct rseq_percpu_pool *pool;
        void *addr;

again:
        pthread_mutex_lock(&pool_set->lock);
        /* First smallest present pool where @len fits. */
        for (order = min_order; order < POOL_SET_NR_ENTRIES; order++) {
                pool = pool_set->entries[order];

                if (!pool)
                        continue;
                if (pool->item_len >= len)
                        goto found;
        }
        pool = NULL;
found:
        pthread_mutex_unlock(&pool_set->lock);
        if (pool) {
                addr = __rseq_percpu_malloc(pool, zeroed);
                if (addr == NULL && errno == ENOMEM) {
                        /*
                         * If the allocation failed, try again with a
                         * larger pool.
                         */
                        min_order = order + 1;
                        goto again;
                }
        } else {
                /* Not found. */
                errno = ENOMEM;
                addr = NULL;
        }
        return addr;
}

void *rseq_percpu_pool_set_malloc(struct rseq_percpu_pool_set *pool_set, size_t len)
{
        return __rseq_percpu_pool_set_malloc(pool_set, len, false);
}

void *rseq_percpu_pool_set_zmalloc(struct rseq_percpu_pool_set *pool_set, size_t len)
{
        return __rseq_percpu_pool_set_malloc(pool_set, len, true);
}