Commit | Line | Data |
---|---|---|
ef6695f1 MD |
1 | // SPDX-License-Identifier: MIT |
2 | // SPDX-FileCopyrightText: 2024 Mathieu Desnoyers <mathieu.desnoyers@efficios.com> | |
cabbbc8e | 3 | // SPDX-FileCopyrightText: 2024 Olivier Dion <odion@efficios.com> |
ef6695f1 | 4 | |
34337fec | 5 | #include <rseq/mempool.h> |
ef6695f1 MD |
6 | #include <sys/mman.h> |
7 | #include <assert.h> | |
8 | #include <string.h> | |
9 | #include <pthread.h> | |
10 | #include <unistd.h> | |
11 | #include <stdlib.h> | |
12 | #include <rseq/compiler.h> | |
13 | #include <errno.h> | |
14 | #include <stdint.h> | |
15 | #include <stdbool.h> | |
367e559c | 16 | #include <stdio.h> |
a5694a4d | 17 | #include <fcntl.h> |
367e559c MD |
18 | |
19 | #ifdef HAVE_LIBNUMA | |
20 | # include <numa.h> | |
21 | # include <numaif.h> | |
22 | #endif | |
ef6695f1 | 23 | |
34337fec | 24 | #include "rseq-utils.h" |
252f9411 | 25 | #include "list.h" |
47c725dd | 26 | #include <rseq/rseq.h> |
19be9217 | 27 | |
ef6695f1 | 28 | /* |
b73b0c25 | 29 | * rseq-mempool.c: rseq CPU-Local Storage (CLS) memory allocator. |
ef6695f1 | 30 | * |
8ab16a24 MD |
31 | * The rseq per-CPU memory allocator allows the application to request
32 | * memory pools of CPU-Local memory, each containing objects of a | |
8aa1462d MD |
33 | * given size (rounded to next power of 2), reserving a given virtual |
34 | * address size per CPU, for a given maximum number of CPUs. | |
8ab16a24 MD |
35 | * |
36 | * The per-CPU memory allocator is analogous to TLS (Thread-Local | |
37 | * Storage) memory: TLS is Thread-Local Storage, whereas the per-CPU | |
38 | * memory allocator provides CPU-Local Storage. | |
ef6695f1 MD |
39 | */ |
40 | ||
/*
 * Number of slots in a pool set. Pool set entries are indexed by item
 * length rounded to the next power of 2, hence one slot per bit of an
 * unsigned long.
 */
#define POOL_SET_NR_ENTRIES RSEQ_BITS_PER_LONG

/*
 * Number of header pages placed in front of the per-CPU data of each
 * range: a canary page followed by the page holding struct
 * rseq_mempool_range.
 */
#define POOL_HEADER_NR_PAGES 2

/*
 * Smallest allocation should hold enough space for a free list pointer.
 */
#if RSEQ_BITS_PER_LONG == 64
# define POOL_SET_MIN_ENTRY 3 /* Smallest item_len=8 */
#else
# define POOL_SET_MIN_ENTRY 2 /* Smallest item_len=4 */
#endif

/* Number of bits in one unsigned long word of the allocation bitmap. */
#define BIT_PER_ULONG (8 * sizeof(unsigned long))

/* Maximum number of pages handed to a single move_pages() call. */
#define MOVE_PAGES_BATCH_SIZE 4096

/*
 * The struct rseq_mempool_range describing a range is stored this many
 * bytes before the base address of the range.
 */
#define RANGE_HEADER_OFFSET sizeof(struct rseq_mempool_range)

/*
 * Default poison value for COW_INIT pools.
 * NOTE(review): the code consuming this default is not part of this
 * section of the file; confirm where it is applied.
 */
#if RSEQ_BITS_PER_LONG == 64
# define DEFAULT_COW_INIT_POISON_VALUE 0x5555555555555555ULL
#else
# define DEFAULT_COW_INIT_POISON_VALUE 0x55555555UL
#endif

/*
 * Define the default COW_ZERO poison value as zero to prevent useless
 * COW page allocation when writing poison values when freeing items.
 */
#define DEFAULT_COW_ZERO_POISON_VALUE 0x0
71 | ||
ef6695f1 MD |
struct free_list_node;

/*
 * Link of a pool free list. Each node only stores the pointer to the
 * next free node; the list is NULL-terminated.
 */
struct free_list_node {
	struct free_list_node *next;
};
77 | ||
/*
 * Kind of memory pool. For a global pool, the range init callback is
 * invoked once with a cpu argument of -1.
 */
enum mempool_type {
	MEMPOOL_TYPE_PERCPU = 0,	/* Default */
	MEMPOOL_TYPE_GLOBAL = 1,
};
82 | ||
/*
 * Pool creation attributes. A copy of the attributes is embedded in
 * each struct rseq_mempool.
 */
struct rseq_mempool_attr {
	/*
	 * When init_set is true, range creation calls
	 * init_func(init_priv, addr, len, cpu). A non-zero return value
	 * makes range creation fail.
	 */
	bool init_set;
	int (*init_func)(void *priv, void *addr, size_t len, int cpu);
	void *init_priv;

	/*
	 * Robust pools get an allocation bitmap, a dedicated free list
	 * area, and free-list/poison consistency checks.
	 */
	bool robust_set;

	enum mempool_type type;
	size_t stride;		/* Length (bytes) of the memory area of each CPU within a range. */
	int max_nr_cpus;	/* Number of per-CPU areas in each range. */

	unsigned long max_nr_ranges;	/* Upper bound on the number of ranges; 0 means no limit. */

	/*
	 * Poison value written over items, one uintptr_t word at a time.
	 * NOTE(review): poison_set presumably records whether the user
	 * provided the value explicitly; confirm against the attribute
	 * setters.
	 */
	bool poison_set;
	uintptr_t poison;

	/* COW_INIT pools carry an extra init values area; COW_ZERO pools do not. */
	enum rseq_mempool_populate_policy populate_policy;
};
101 | ||
struct rseq_mempool_range;

/*
 * Descriptor of one memory range of a pool. The descriptor itself is
 * stored at the very end of the header page, immediately before the
 * base address of the range.
 */
struct rseq_mempool_range {
	struct list_head node;		/* Linked list of ranges. */
	struct rseq_mempool *pool;	/* Backward reference to container pool. */

	/*
	 * Memory layout of a mempool range:
	 * - Canary header page (for detection of destroy-after-fork of
	 *   COW_INIT pool),
	 * - Header page (contains struct rseq_mempool_range at the
	 *   very end),
	 * - Base of the per-cpu data, starting with CPU 0.
	 *   Aliases with free-list for non-robust COW_ZERO pool.
	 * - CPU 1,
	 * ...
	 * - CPU max_nr_cpus - 1
	 * - init values (only allocated for COW_INIT pool).
	 *   Aliases with free-list for non-robust COW_INIT pool.
	 * - free list (for robust pool).
	 *
	 * The free list aliases the CPU 0 memory area for non-robust
	 * COW_ZERO pools. It aliases with init values for non-robust
	 * COW_INIT pools. It is located immediately after the init
	 * values for robust pools.
	 */
	void *header;	/* Start of the header pages. */
	void *base;	/* Start of the per-cpu data (CPU 0). */
	/*
	 * The init values contain malloc_init/zmalloc values.
	 * Pointer is NULL for RSEQ_MEMPOOL_POPULATE_COW_ZERO.
	 */
	void *init;
	/*
	 * Offset, relative to base, of the first item which was never
	 * allocated. Offsets below it have been handed out at least once.
	 */
	size_t next_unused;

	/* Pool range mmap/munmap */
	void *mmap_addr;
	size_t mmap_len;

	/* NOTE(review): presumably the number of currently allocated items; not used in this section of the file. */
	size_t allocated_items;

	/* Track alloc/free. Only allocated for robust pools, one bit per item. */
	unsigned long *alloc_bitmap;
};
146 | ||
/*
 * A memory pool: a list of ranges holding items of a single length,
 * plus the free list of released items.
 */
struct rseq_mempool {
	struct list_head range_list;	/* Head of ranges linked-list. */
	unsigned long nr_ranges;	/* Compared against attr.max_nr_ranges when creating a range. */

	size_t item_len;	/* Length (bytes) of each item. */
	int item_order;		/* Shift count converting a byte length into a number of items. */

	/*
	 * COW_INIT non-robust pools:
	 *                 The free list chains freed items on the init
	 *                 values address range.
	 *
	 * COW_ZERO non-robust pools:
	 *                 The free list chains freed items on the CPU 0
	 *                 address range. We should rethink this
	 *                 decision if false sharing between malloc/free
	 *                 from other CPUs and data accesses from CPU 0
	 *                 becomes an issue.
	 *
	 * Robust pools:   The free list chains freed items in the
	 *                 address range dedicated for the free list.
	 *
	 * This is a NULL-terminated singly-linked list.
	 */
	struct free_list_node *free_list_head;

	/* This lock protects allocation/free within the pool. */
	pthread_mutex_t lock;

	struct rseq_mempool_attr attr;
	char *name;		/* Pool name used in diagnostics; may be NULL. */
};
179 | ||
ef6695f1 MD |
/*
 * Pool set entries are indexed by item_len rounded to the next power of
 * 2. A pool set can contain NULL pool entries, in which case the next
 * large enough entry will be used for allocation.
 */
struct rseq_mempool_set {
	/* This lock protects add vs malloc/zmalloc within the pool set. */
	pthread_mutex_t lock;
	/* One slot per power-of-2 item length; unused slots are NULL. */
	struct rseq_mempool *entries[POOL_SET_NR_ENTRIES];
};
190 | ||
86617384 MD |
191 | static |
192 | const char *get_pool_name(const struct rseq_mempool *pool) | |
193 | { | |
194 | return pool->name ? : "<anonymous>"; | |
195 | } | |
196 | ||
367e559c | 197 | static |
6fbf1fb6 | 198 | void *__rseq_pool_range_percpu_ptr(const struct rseq_mempool_range *range, int cpu, |
f2981623 | 199 | uintptr_t item_offset, size_t stride) |
367e559c | 200 | { |
15b63c9f | 201 | return range->base + (stride * cpu) + item_offset; |
367e559c MD |
202 | } |
203 | ||
a5694a4d MD |
204 | static |
205 | void *__rseq_pool_range_init_ptr(const struct rseq_mempool_range *range, | |
206 | uintptr_t item_offset) | |
207 | { | |
208 | if (!range->init) | |
209 | return NULL; | |
210 | return range->init + item_offset; | |
211 | } | |
212 | ||
213 | static | |
214 | void __rseq_percpu *__rseq_free_list_to_percpu_ptr(const struct rseq_mempool *pool, | |
215 | struct free_list_node *node) | |
216 | { | |
217 | void __rseq_percpu *p = (void __rseq_percpu *) node; | |
218 | ||
c0de0012 MD |
219 | if (pool->attr.robust_set) { |
220 | /* Skip cpus. */ | |
a5694a4d | 221 | p -= pool->attr.max_nr_cpus * pool->attr.stride; |
c0de0012 | 222 | /* Skip init values */ |
805d0043 | 223 | if (pool->attr.populate_policy == RSEQ_MEMPOOL_POPULATE_COW_INIT) |
c0de0012 MD |
224 | p -= pool->attr.stride; |
225 | ||
226 | } else { | |
805d0043 MD |
227 | /* COW_INIT free list is in init values */ |
228 | if (pool->attr.populate_policy == RSEQ_MEMPOOL_POPULATE_COW_INIT) | |
c0de0012 MD |
229 | p -= pool->attr.max_nr_cpus * pool->attr.stride; |
230 | } | |
a5694a4d MD |
231 | return p; |
232 | } | |
233 | ||
234 | static | |
235 | struct free_list_node *__rseq_percpu_to_free_list_ptr(const struct rseq_mempool *pool, | |
236 | void __rseq_percpu *p) | |
237 | { | |
c0de0012 MD |
238 | if (pool->attr.robust_set) { |
239 | /* Skip cpus. */ | |
a5694a4d | 240 | p += pool->attr.max_nr_cpus * pool->attr.stride; |
c0de0012 | 241 | /* Skip init values */ |
805d0043 | 242 | if (pool->attr.populate_policy == RSEQ_MEMPOOL_POPULATE_COW_INIT) |
c0de0012 MD |
243 | p += pool->attr.stride; |
244 | ||
245 | } else { | |
805d0043 MD |
246 | /* COW_INIT free list is in init values */ |
247 | if (pool->attr.populate_policy == RSEQ_MEMPOOL_POPULATE_COW_INIT) | |
c0de0012 MD |
248 | p += pool->attr.max_nr_cpus * pool->attr.stride; |
249 | } | |
a5694a4d MD |
250 | return (struct free_list_node *) p; |
251 | } | |
252 | ||
/*
 * Compare every pointer-sized word of the item at @p against
 * @cmp_value.
 *
 * @p:                start of the item.
 * @item_len:         item length in bytes; the item is scanned in steps
 *                    of sizeof(uintptr_t).
 * @cmp_value:        value expected in every word.
 * @unexpected_value: when non-NULL, receives the first word which
 *                    differs from @cmp_value. Left untouched when the
 *                    whole item matches.
 *
 * Returns 0 when every word equals @cmp_value, otherwise the non-zero
 * difference between the first mismatching word and @cmp_value.
 */
static
intptr_t rseq_cmp_item(void *p, size_t item_len, intptr_t cmp_value, intptr_t *unexpected_value)
{
	size_t offset;

	for (offset = 0; offset < item_len; offset += sizeof(uintptr_t)) {
		intptr_t v = *((intptr_t *) ((char *) p + offset));

		if (v == cmp_value)
			continue;
		if (unexpected_value)
			*unexpected_value = v;
		/*
		 * Subtract as unsigned values: a signed subtraction
		 * overflows (undefined behavior) when both words are far
		 * apart, e.g. INTPTR_MIN - 1. The unsigned difference of
		 * two distinct words is never zero.
		 */
		return (intptr_t) ((uintptr_t) v - (uintptr_t) cmp_value);
	}
	return 0;
}
270 | ||
367e559c | 271 | static |
15b63c9f MD |
272 | void rseq_percpu_zero_item(struct rseq_mempool *pool, |
273 | struct rseq_mempool_range *range, uintptr_t item_offset) | |
367e559c | 274 | { |
a5694a4d | 275 | char *init_p = NULL; |
367e559c MD |
276 | int i; |
277 | ||
a5694a4d MD |
278 | init_p = __rseq_pool_range_init_ptr(range, item_offset); |
279 | if (init_p) | |
644298bb | 280 | bzero(init_p, pool->item_len); |
cb475906 | 281 | for (i = 0; i < pool->attr.max_nr_cpus; i++) { |
15b63c9f | 282 | char *p = __rseq_pool_range_percpu_ptr(range, i, |
cb475906 | 283 | item_offset, pool->attr.stride); |
a5694a4d | 284 | |
1b658191 MD |
285 | /* |
286 | * If item is already zeroed, either because the | |
287 | * init range update has propagated or because the | |
288 | * content is already zeroed (e.g. zero page), don't | |
289 | * write to the page. This eliminates useless COW over | |
290 | * the zero page just for overwriting it with zeroes. | |
291 | * | |
805d0043 | 292 | * This means zmalloc() in COW_ZERO policy pool do |
1b658191 MD |
293 | * not trigger COW for CPUs which are not actively |
294 | * writing to the pool. This is however not the case for | |
295 | * malloc_init() in populate-all pools if it populates | |
296 | * non-zero content. | |
297 | */ | |
14af0aa2 | 298 | if (!rseq_cmp_item(p, pool->item_len, 0, NULL)) |
a5694a4d | 299 | continue; |
644298bb | 300 | bzero(p, pool->item_len); |
367e559c MD |
301 | } |
302 | } | |
303 | ||
6ff43d9a MD |
304 | static |
305 | void rseq_percpu_init_item(struct rseq_mempool *pool, | |
306 | struct rseq_mempool_range *range, uintptr_t item_offset, | |
307 | void *init_ptr, size_t init_len) | |
308 | { | |
a5694a4d | 309 | char *init_p = NULL; |
6ff43d9a MD |
310 | int i; |
311 | ||
a5694a4d MD |
312 | init_p = __rseq_pool_range_init_ptr(range, item_offset); |
313 | if (init_p) | |
314 | memcpy(init_p, init_ptr, init_len); | |
6ff43d9a MD |
315 | for (i = 0; i < pool->attr.max_nr_cpus; i++) { |
316 | char *p = __rseq_pool_range_percpu_ptr(range, i, | |
317 | item_offset, pool->attr.stride); | |
a5694a4d | 318 | |
1b658191 MD |
319 | /* |
320 | * If the update propagated through a shared mapping, | |
321 | * or the item already has the correct content, skip | |
322 | * writing it into the cpu item to eliminate useless | |
323 | * COW of the page. | |
324 | */ | |
325 | if (!memcmp(init_ptr, p, init_len)) | |
a5694a4d | 326 | continue; |
6ff43d9a MD |
327 | memcpy(p, init_ptr, init_len); |
328 | } | |
329 | } | |
330 | ||
a5694a4d MD |
/*
 * Fill the @item_len bytes at @p with @poison, one pointer-sized word
 * at a time.
 */
static
void rseq_poison_item(void *p, size_t item_len, uintptr_t poison)
{
	size_t pos = 0;

	while (pos < item_len) {
		uintptr_t *word = (uintptr_t *) ((char *) p + pos);

		*word = poison;
		pos += sizeof(uintptr_t);
	}
}
339 | ||
455e090e MD |
340 | static |
341 | void rseq_percpu_poison_item(struct rseq_mempool *pool, | |
342 | struct rseq_mempool_range *range, uintptr_t item_offset) | |
343 | { | |
344 | uintptr_t poison = pool->attr.poison; | |
a5694a4d | 345 | char *init_p = NULL; |
455e090e MD |
346 | int i; |
347 | ||
a5694a4d MD |
348 | init_p = __rseq_pool_range_init_ptr(range, item_offset); |
349 | if (init_p) | |
350 | rseq_poison_item(init_p, pool->item_len, poison); | |
455e090e MD |
351 | for (i = 0; i < pool->attr.max_nr_cpus; i++) { |
352 | char *p = __rseq_pool_range_percpu_ptr(range, i, | |
353 | item_offset, pool->attr.stride); | |
455e090e | 354 | |
1b658191 MD |
355 | /* |
356 | * If the update propagated through a shared mapping, | |
357 | * or the item already has the correct content, skip | |
358 | * writing it into the cpu item to eliminate useless | |
359 | * COW of the page. | |
360 | * | |
361 | * It is recommended to use zero as poison value for | |
805d0043 MD |
362 | * COW_ZERO pools to eliminate COW due to writing |
363 | * poison to CPU memory still backed by the zero page. | |
1b658191 | 364 | */ |
14af0aa2 | 365 | if (rseq_cmp_item(p, pool->item_len, poison, NULL) == 0) |
a5694a4d MD |
366 | continue; |
367 | rseq_poison_item(p, pool->item_len, poison); | |
368 | } | |
369 | } | |
370 | ||
/*
 * Verify that the @item_len bytes at @p only contain @poison. On
 * mismatch, report the first unexpected word along with the pool,
 * @item_offset and the caller address on stderr, then abort.
 *
 * Always inline for __builtin_return_address(0).
 */
static inline __attribute__((always_inline))
void rseq_check_poison_item(const struct rseq_mempool *pool, uintptr_t item_offset,
		void *p, size_t item_len, uintptr_t poison)
{
	intptr_t unexpected_value;

	if (rseq_cmp_item(p, item_len, poison, &unexpected_value) == 0)
		return;

	/*
	 * Cast the variadic arguments to the exact types expected by the
	 * conversion specifiers: %zu takes a size_t and %p a pointer to
	 * void.
	 */
	fprintf(stderr, "%s: Poison corruption detected (0x%lx) for pool: \"%s\" (%p), item offset: %zu, caller: %p.\n",
		__func__, (unsigned long) unexpected_value, get_pool_name(pool),
		(const void *) pool, (size_t) item_offset,
		(void *) __builtin_return_address(0));
	abort();
}
385 | ||
386 | /* Always inline for __builtin_return_address(0). */ | |
387 | static inline __attribute__((always_inline)) | |
6fbf1fb6 MD |
388 | void rseq_percpu_check_poison_item(const struct rseq_mempool *pool, |
389 | const struct rseq_mempool_range *range, uintptr_t item_offset) | |
86617384 MD |
390 | { |
391 | uintptr_t poison = pool->attr.poison; | |
a5694a4d | 392 | char *init_p; |
86617384 MD |
393 | int i; |
394 | ||
3975084e | 395 | if (!pool->attr.robust_set) |
86617384 | 396 | return; |
a5694a4d MD |
397 | init_p = __rseq_pool_range_init_ptr(range, item_offset); |
398 | if (init_p) | |
c0de0012 | 399 | rseq_check_poison_item(pool, item_offset, init_p, pool->item_len, poison); |
86617384 MD |
400 | for (i = 0; i < pool->attr.max_nr_cpus; i++) { |
401 | char *p = __rseq_pool_range_percpu_ptr(range, i, | |
402 | item_offset, pool->attr.stride); | |
c0de0012 | 403 | rseq_check_poison_item(pool, item_offset, p, pool->item_len, poison); |
455e090e MD |
404 | } |
405 | } | |
406 | ||
#ifdef HAVE_LIBNUMA
/*
 * Move the pages backing the @len bytes at @addr to the NUMA node of
 * @cpu, in batches of at most MOVE_PAGES_BATCH_SIZE pages.
 *
 * @numa_flags is passed to move_pages() and must be non-zero.
 *
 * Returns 0 when every batch was handed to move_pages() without error;
 * pages which could not be migrated are only reported on stderr.
 * Returns a negative value on error: errno is set to EINVAL when
 * @numa_flags is zero; a failing numa_node_of_cpu() or move_pages()
 * call is propagated as-is.
 */
int rseq_mempool_range_init_numa(void *addr, size_t len, int cpu, int numa_flags)
{
	unsigned long nr_pages, page_len;
	int status[MOVE_PAGES_BATCH_SIZE];
	int nodes[MOVE_PAGES_BATCH_SIZE];
	void *pages[MOVE_PAGES_BATCH_SIZE];
	long ret;

	if (!numa_flags) {
		errno = EINVAL;
		return -1;
	}
	page_len = rseq_get_page_len();
	nr_pages = len >> rseq_get_count_order_ulong(page_len);

	/* Every page of every batch targets the same node. */
	nodes[0] = numa_node_of_cpu(cpu);
	if (nodes[0] < 0)
		return -1;

	for (size_t k = 1; k < RSEQ_ARRAY_SIZE(nodes); ++k) {
		nodes[k] = nodes[0];
	}

	for (unsigned long page = 0; page < nr_pages;) {

		size_t max_k = RSEQ_ARRAY_SIZE(pages);
		size_t left = nr_pages - page;

		if (left < max_k) {
			max_k = left;
		}

		for (size_t k = 0; k < max_k; ++k, ++page) {
			pages[k] = (char *) addr + (page * page_len);
			status[k] = -EPERM;
		}

		ret = move_pages(0, max_k, pages, nodes, status, numa_flags);

		if (ret < 0)
			return (int) ret;

		if (ret > 0) {
			/* ret is a long and -status[k] an int: use matching conversions. */
			fprintf(stderr, "%ld pages were not migrated\n", ret);
			for (size_t k = 0; k < max_k; ++k) {
				if (status[k] < 0)
					fprintf(stderr,
						"Error while moving page %p to numa node %d: %d\n",
						pages[k], nodes[k], -status[k]);
			}
		}
	}
	return 0;
}
#else
/*
 * Built without libnuma: NUMA placement is not available. Always fails
 * with errno set to ENOSYS.
 */
int rseq_mempool_range_init_numa(void *addr __attribute__((unused)),
		size_t len __attribute__((unused)),
		int cpu __attribute__((unused)),
		int numa_flags __attribute__((unused)))
{
	errno = ENOSYS;
	return -1;
}
#endif
472 | ||
0fdf7a4c | 473 | static |
0ba2a93e | 474 | int create_alloc_bitmap(struct rseq_mempool *pool, struct rseq_mempool_range *range) |
0fdf7a4c OD |
475 | { |
476 | size_t count; | |
477 | ||
cb475906 | 478 | count = ((pool->attr.stride >> pool->item_order) + BIT_PER_ULONG - 1) / BIT_PER_ULONG; |
0fdf7a4c OD |
479 | |
480 | /* | |
9649c7ee MD |
481 | * Not being able to create the validation bitmap is an error |
482 | * that needs to be reported. | |
0fdf7a4c | 483 | */ |
b73b0c25 MD |
484 | range->alloc_bitmap = calloc(count, sizeof(unsigned long)); |
485 | if (!range->alloc_bitmap) | |
9649c7ee MD |
486 | return -1; |
487 | return 0; | |
0fdf7a4c OD |
488 | } |
489 | ||
b73b0c25 | 490 | static |
a5694a4d | 491 | bool percpu_addr_in_pool(const struct rseq_mempool *pool, void __rseq_percpu *_addr) |
b73b0c25 | 492 | { |
0ba2a93e | 493 | struct rseq_mempool_range *range; |
a5694a4d | 494 | void *addr = (void *) _addr; |
b73b0c25 | 495 | |
252f9411 | 496 | list_for_each_entry(range, &pool->range_list, node) { |
b73b0c25 MD |
497 | if (addr >= range->base && addr < range->base + range->next_unused) |
498 | return true; | |
499 | } | |
500 | return false; | |
501 | } | |
502 | ||
a9ec6111 OD |
503 | /* Always inline for __builtin_return_address(0). */ |
504 | static inline __attribute__((always_inline)) | |
1a426b47 | 505 | void check_free_list(const struct rseq_mempool *pool, bool mapping_accessible) |
a9ec6111 | 506 | { |
b73b0c25 MD |
507 | size_t total_item = 0, total_never_allocated = 0, total_freed = 0, |
508 | max_list_traversal = 0, traversal_iteration = 0; | |
0ba2a93e | 509 | struct rseq_mempool_range *range; |
b73b0c25 | 510 | |
1a426b47 | 511 | if (!pool->attr.robust_set || !mapping_accessible) |
b73b0c25 MD |
512 | return; |
513 | ||
252f9411 | 514 | list_for_each_entry(range, &pool->range_list, node) { |
cb475906 MD |
515 | total_item += pool->attr.stride >> pool->item_order; |
516 | total_never_allocated += (pool->attr.stride - range->next_unused) >> pool->item_order; | |
b73b0c25 MD |
517 | } |
518 | max_list_traversal = total_item - total_never_allocated; | |
a9ec6111 OD |
519 | |
520 | for (struct free_list_node *node = pool->free_list_head, *prev = NULL; | |
521 | node; | |
522 | prev = node, | |
523 | node = node->next) { | |
524 | ||
a9ec6111 | 525 | if (traversal_iteration >= max_list_traversal) { |
ca452fee MD |
526 | fprintf(stderr, "%s: Corrupted free-list; Possibly infinite loop in pool \"%s\" (%p), caller %p.\n", |
527 | __func__, get_pool_name(pool), pool, __builtin_return_address(0)); | |
a9ec6111 OD |
528 | abort(); |
529 | } | |
530 | ||
531 | /* Node is out of range. */ | |
a5694a4d | 532 | if (!percpu_addr_in_pool(pool, __rseq_free_list_to_percpu_ptr(pool, node))) { |
a9ec6111 | 533 | if (prev) |
ca452fee MD |
534 | fprintf(stderr, "%s: Corrupted free-list node %p -> [out-of-range %p] in pool \"%s\" (%p), caller %p.\n", |
535 | __func__, prev, node, get_pool_name(pool), pool, __builtin_return_address(0)); | |
a9ec6111 | 536 | else |
ca452fee MD |
537 | fprintf(stderr, "%s: Corrupted free-list node [out-of-range %p] in pool \"%s\" (%p), caller %p.\n", |
538 | __func__, node, get_pool_name(pool), pool, __builtin_return_address(0)); | |
a9ec6111 OD |
539 | abort(); |
540 | } | |
541 | ||
b73b0c25 MD |
542 | traversal_iteration++; |
543 | total_freed++; | |
a9ec6111 OD |
544 | } |
545 | ||
546 | if (total_never_allocated + total_freed != total_item) { | |
ca452fee MD |
547 | fprintf(stderr, "%s: Corrupted free-list in pool \"%s\" (%p); total-item: %zu total-never-used: %zu total-freed: %zu, caller %p.\n", |
548 | __func__, get_pool_name(pool), pool, total_item, total_never_allocated, total_freed, __builtin_return_address(0)); | |
a9ec6111 OD |
549 | abort(); |
550 | } | |
a9ec6111 OD |
551 | } |
552 | ||
6fbf1fb6 MD |
553 | /* Always inline for __builtin_return_address(0). */ |
554 | static inline __attribute__((always_inline)) | |
555 | void check_range_poison(const struct rseq_mempool *pool, | |
556 | const struct rseq_mempool_range *range) | |
557 | { | |
558 | size_t item_offset; | |
559 | ||
560 | for (item_offset = 0; item_offset < range->next_unused; | |
561 | item_offset += pool->item_len) | |
562 | rseq_percpu_check_poison_item(pool, range, item_offset); | |
563 | } | |
564 | ||
565 | /* Always inline for __builtin_return_address(0). */ | |
566 | static inline __attribute__((always_inline)) | |
1a426b47 | 567 | void check_pool_poison(const struct rseq_mempool *pool, bool mapping_accessible) |
6fbf1fb6 MD |
568 | { |
569 | struct rseq_mempool_range *range; | |
570 | ||
1a426b47 | 571 | if (!pool->attr.robust_set || !mapping_accessible) |
6fbf1fb6 | 572 | return; |
252f9411 | 573 | list_for_each_entry(range, &pool->range_list, node) |
6fbf1fb6 MD |
574 | check_range_poison(pool, range); |
575 | } | |
576 | ||
e7cbbc10 MD |
577 | /* Always inline for __builtin_return_address(0). */ |
578 | static inline __attribute__((always_inline)) | |
0ba2a93e | 579 | void destroy_alloc_bitmap(struct rseq_mempool *pool, struct rseq_mempool_range *range) |
0fdf7a4c | 580 | { |
b73b0c25 | 581 | unsigned long *bitmap = range->alloc_bitmap; |
9649c7ee | 582 | size_t count, total_leaks = 0; |
0fdf7a4c | 583 | |
9649c7ee | 584 | if (!bitmap) |
0fdf7a4c | 585 | return; |
0fdf7a4c | 586 | |
cb475906 | 587 | count = ((pool->attr.stride >> pool->item_order) + BIT_PER_ULONG - 1) / BIT_PER_ULONG; |
0fdf7a4c OD |
588 | |
589 | /* Assert that all items in the pool were freed. */ | |
9649c7ee MD |
590 | for (size_t k = 0; k < count; ++k) |
591 | total_leaks += rseq_hweight_ulong(bitmap[k]); | |
592 | if (total_leaks) { | |
ca452fee MD |
593 | fprintf(stderr, "%s: Pool \"%s\" (%p) has %zu leaked items on destroy, caller: %p.\n", |
594 | __func__, get_pool_name(pool), pool, total_leaks, (void *) __builtin_return_address(0)); | |
9649c7ee | 595 | abort(); |
0fdf7a4c OD |
596 | } |
597 | ||
598 | free(bitmap); | |
a5694a4d | 599 | range->alloc_bitmap = NULL; |
0fdf7a4c OD |
600 | } |
601 | ||
b73b0c25 MD |
602 | /* Always inline for __builtin_return_address(0). */ |
603 | static inline __attribute__((always_inline)) | |
0ba2a93e | 604 | int rseq_mempool_range_destroy(struct rseq_mempool *pool, |
1a426b47 MD |
605 | struct rseq_mempool_range *range, |
606 | bool mapping_accessible) | |
b73b0c25 MD |
607 | { |
608 | destroy_alloc_bitmap(pool, range); | |
1a426b47 MD |
609 | if (!mapping_accessible) { |
610 | /* | |
611 | * Only the header pages are populated in the child | |
612 | * process. | |
613 | */ | |
614 | return munmap(range->header, POOL_HEADER_NR_PAGES * rseq_get_page_len()); | |
615 | } | |
5cd72fc7 | 616 | return munmap(range->mmap_addr, range->mmap_len); |
5c99f3d6 MD |
617 | } |
618 | ||
619 | /* | |
620 | * Allocate a memory mapping aligned on @alignment, with an optional | |
621 | * @pre_header before the mapping. | |
622 | */ | |
623 | static | |
5cd72fc7 | 624 | void *aligned_mmap_anonymous(size_t page_size, size_t len, size_t alignment, |
5c99f3d6 MD |
625 | void **pre_header, size_t pre_header_len) |
626 | { | |
627 | size_t minimum_page_count, page_count, extra, total_allocate = 0; | |
628 | int page_order; | |
629 | void *ptr; | |
630 | ||
631 | if (len < page_size || alignment < page_size || | |
b72b2d9e | 632 | !is_pow2(alignment) || (len & (alignment - 1))) { |
5c99f3d6 MD |
633 | errno = EINVAL; |
634 | return NULL; | |
635 | } | |
636 | page_order = rseq_get_count_order_ulong(page_size); | |
637 | if (page_order < 0) { | |
638 | errno = EINVAL; | |
639 | return NULL; | |
640 | } | |
641 | if (pre_header_len && (pre_header_len & (page_size - 1))) { | |
642 | errno = EINVAL; | |
643 | return NULL; | |
644 | } | |
645 | ||
646 | minimum_page_count = (pre_header_len + len) >> page_order; | |
647 | page_count = (pre_header_len + len + alignment - page_size) >> page_order; | |
648 | ||
649 | assert(page_count >= minimum_page_count); | |
650 | ||
5cd72fc7 MD |
651 | ptr = mmap(NULL, page_count << page_order, PROT_READ | PROT_WRITE, |
652 | MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); | |
653 | if (ptr == MAP_FAILED) { | |
654 | ptr = NULL; | |
5c99f3d6 | 655 | goto alloc_error; |
5cd72fc7 | 656 | } |
5c99f3d6 MD |
657 | |
658 | total_allocate = page_count << page_order; | |
659 | ||
660 | if (!(((uintptr_t) ptr + pre_header_len) & (alignment - 1))) { | |
661 | /* Pointer is already aligned. ptr points to pre_header. */ | |
662 | goto out; | |
663 | } | |
664 | ||
665 | /* Unmap extra before. */ | |
666 | extra = offset_align((uintptr_t) ptr + pre_header_len, alignment); | |
667 | assert(!(extra & (page_size - 1))); | |
5cd72fc7 | 668 | if (munmap(ptr, extra)) { |
5c99f3d6 MD |
669 | perror("munmap"); |
670 | abort(); | |
671 | } | |
672 | total_allocate -= extra; | |
673 | ptr += extra; /* ptr points to pre_header */ | |
674 | page_count -= extra >> page_order; | |
675 | out: | |
676 | assert(page_count >= minimum_page_count); | |
677 | ||
678 | if (page_count > minimum_page_count) { | |
679 | void *extra_ptr; | |
680 | ||
681 | /* Unmap extra after. */ | |
682 | extra_ptr = ptr + (minimum_page_count << page_order); | |
683 | extra = (page_count - minimum_page_count) << page_order; | |
5cd72fc7 | 684 | if (munmap(extra_ptr, extra)) { |
5c99f3d6 MD |
685 | perror("munmap"); |
686 | abort(); | |
687 | } | |
688 | total_allocate -= extra; | |
689 | } | |
690 | ||
691 | assert(!(((uintptr_t)ptr + pre_header_len) & (alignment - 1))); | |
692 | assert(total_allocate == len + pre_header_len); | |
693 | ||
694 | alloc_error: | |
695 | if (ptr) { | |
696 | if (pre_header) | |
697 | *pre_header = ptr; | |
698 | ptr += pre_header_len; | |
699 | } | |
700 | return ptr; | |
b73b0c25 MD |
701 | } |
702 | ||
a5694a4d | 703 | static |
cc0413ab | 704 | int rseq_memfd_create_init(const char *poolname, size_t init_len) |
a5694a4d | 705 | { |
a10c1c93 | 706 | int fd; |
cc0413ab OD |
707 | char buf[249]; /* Limit is 249 bytes. */ |
708 | const char *name; | |
a10c1c93 | 709 | |
cc0413ab OD |
710 | if (poolname) { |
711 | snprintf(buf, sizeof(buf), "%s:rseq-mempool", poolname); | |
712 | name = buf; | |
713 | } else { | |
714 | name = "<anonymous>:rseq-mempool"; | |
715 | } | |
716 | ||
717 | fd = memfd_create(name, MFD_CLOEXEC); | |
a10c1c93 MD |
718 | if (fd < 0) { |
719 | perror("memfd_create"); | |
720 | goto end; | |
a5694a4d | 721 | } |
a10c1c93 | 722 | if (ftruncate(fd, (off_t) init_len)) { |
025165ad MD |
723 | if (close(fd)) |
724 | perror("close"); | |
a10c1c93 MD |
725 | fd = -1; |
726 | goto end; | |
727 | } | |
728 | end: | |
729 | return fd; | |
730 | } | |
731 | ||
/*
 * Close @fd, reporting a close failure on stderr. Negative file
 * descriptors are ignored.
 */
static
void rseq_memfd_close(int fd)
{
	if (fd >= 0 && close(fd))
		perror("close");
}
740 | ||
b73b0c25 | 741 | static |
0ba2a93e | 742 | struct rseq_mempool_range *rseq_mempool_range_create(struct rseq_mempool *pool) |
b73b0c25 | 743 | { |
0ba2a93e | 744 | struct rseq_mempool_range *range; |
5c99f3d6 | 745 | unsigned long page_size; |
4aa3220c | 746 | void *header; |
b73b0c25 | 747 | void *base; |
a5694a4d | 748 | size_t range_len; /* Range len excludes header. */ |
1a426b47 | 749 | size_t header_len; |
025165ad | 750 | int memfd = -1; |
b73b0c25 | 751 | |
e11a02d7 MD |
752 | if (pool->attr.max_nr_ranges && |
753 | pool->nr_ranges >= pool->attr.max_nr_ranges) { | |
9d986353 MD |
754 | errno = ENOMEM; |
755 | return NULL; | |
756 | } | |
5c99f3d6 | 757 | page_size = rseq_get_page_len(); |
b73b0c25 | 758 | |
1a426b47 | 759 | header_len = POOL_HEADER_NR_PAGES * page_size; |
a5694a4d | 760 | range_len = pool->attr.stride * pool->attr.max_nr_cpus; |
805d0043 | 761 | if (pool->attr.populate_policy == RSEQ_MEMPOOL_POPULATE_COW_INIT) |
a5694a4d | 762 | range_len += pool->attr.stride; /* init values */ |
c0de0012 | 763 | if (pool->attr.robust_set) |
805d0043 | 764 | range_len += pool->attr.stride; /* dedicated free list */ |
5cd72fc7 | 765 | base = aligned_mmap_anonymous(page_size, range_len, |
1a426b47 | 766 | pool->attr.stride, &header, header_len); |
b73b0c25 | 767 | if (!base) |
5c99f3d6 | 768 | return NULL; |
0ba2a93e | 769 | range = (struct rseq_mempool_range *) (base - RANGE_HEADER_OFFSET); |
5c99f3d6 | 770 | range->pool = pool; |
4aa3220c | 771 | range->header = header; |
a5694a4d | 772 | range->base = base; |
fa6a0fb3 | 773 | range->mmap_addr = header; |
1a426b47 | 774 | range->mmap_len = header_len + range_len; |
a5694a4d | 775 | |
805d0043 | 776 | if (pool->attr.populate_policy == RSEQ_MEMPOOL_POPULATE_COW_INIT) { |
a5694a4d MD |
777 | range->init = base + (pool->attr.stride * pool->attr.max_nr_cpus); |
778 | /* Populate init values pages from memfd */ | |
cc0413ab | 779 | memfd = rseq_memfd_create_init(pool->name, pool->attr.stride); |
a10c1c93 | 780 | if (memfd < 0) |
a5694a4d MD |
781 | goto error_alloc; |
782 | if (mmap(range->init, pool->attr.stride, PROT_READ | PROT_WRITE, | |
1a426b47 | 783 | MAP_SHARED | MAP_FIXED, memfd, 0) != (void *) range->init) |
a5694a4d | 784 | goto error_alloc; |
a5694a4d MD |
785 | assert(pool->attr.type == MEMPOOL_TYPE_PERCPU); |
786 | /* | |
787 | * Map per-cpu memory as private COW mappings of init values. | |
788 | */ | |
789 | { | |
790 | int cpu; | |
791 | ||
792 | for (cpu = 0; cpu < pool->attr.max_nr_cpus; cpu++) { | |
793 | void *p = base + (pool->attr.stride * cpu); | |
794 | size_t len = pool->attr.stride; | |
795 | ||
796 | if (mmap(p, len, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_FIXED, | |
1a426b47 | 797 | memfd, 0) != (void *) p) |
a5694a4d | 798 | goto error_alloc; |
a5694a4d MD |
799 | } |
800 | } | |
bf7b01a3 MD |
801 | /* |
802 | * The init values shared mapping should not be shared | |
803 | * with the children processes across fork. Prevent the | |
804 | * whole mapping from being used across fork. | |
805 | */ | |
1a426b47 MD |
806 | if (madvise(base, range_len, MADV_DONTFORK)) |
807 | goto error_alloc; | |
808 | ||
809 | /* | |
810 | * Write 0x1 in first byte of header first page, which | |
811 | * will be WIPEONFORK (and thus cleared) in children | |
812 | * processes. Used to find out if pool destroy is called | |
813 | * from a child process after fork. | |
814 | */ | |
815 | *((char *) header) = 0x1; | |
816 | if (madvise(header, page_size, MADV_WIPEONFORK)) | |
bf7b01a3 | 817 | goto error_alloc; |
1a426b47 MD |
818 | |
819 | /* | |
820 | * The second header page contains the struct | |
821 | * rseq_mempool_range, which is needed by pool destroy. | |
822 | * Leave this anonymous page populated (COW) in child | |
823 | * processes. | |
824 | */ | |
a10c1c93 | 825 | rseq_memfd_close(memfd); |
025165ad | 826 | memfd = -1; |
a5694a4d MD |
827 | } |
828 | ||
b73b0c25 MD |
829 | if (pool->attr.robust_set) { |
830 | if (create_alloc_bitmap(pool, range)) | |
831 | goto error_alloc; | |
832 | } | |
135811f2 | 833 | if (pool->attr.init_set) { |
374c2773 MD |
834 | switch (pool->attr.type) { |
835 | case MEMPOOL_TYPE_GLOBAL: | |
6e329183 | 836 | if (pool->attr.init_func(pool->attr.init_priv, |
374c2773 | 837 | base, pool->attr.stride, -1)) { |
6e329183 MD |
838 | goto error_alloc; |
839 | } | |
374c2773 MD |
840 | break; |
841 | case MEMPOOL_TYPE_PERCPU: | |
842 | { | |
843 | int cpu; | |
844 | for (cpu = 0; cpu < pool->attr.max_nr_cpus; cpu++) { | |
845 | if (pool->attr.init_func(pool->attr.init_priv, | |
846 | base + (pool->attr.stride * cpu), | |
847 | pool->attr.stride, cpu)) { | |
848 | goto error_alloc; | |
849 | } | |
850 | } | |
851 | break; | |
852 | } | |
853 | default: | |
854 | abort(); | |
135811f2 MD |
855 | } |
856 | } | |
9d986353 | 857 | pool->nr_ranges++; |
b73b0c25 MD |
858 | return range; |
859 | ||
860 | error_alloc: | |
025165ad | 861 | rseq_memfd_close(memfd); |
1a426b47 | 862 | (void) rseq_mempool_range_destroy(pool, range, true); |
b73b0c25 MD |
863 | return NULL; |
864 | } | |
865 | ||
1a426b47 MD |
866 | static |
867 | bool pool_mappings_accessible(struct rseq_mempool *pool) | |
868 | { | |
869 | struct rseq_mempool_range *range; | |
870 | size_t page_size; | |
871 | char *addr; | |
872 | ||
805d0043 | 873 | if (pool->attr.populate_policy != RSEQ_MEMPOOL_POPULATE_COW_INIT) |
1a426b47 | 874 | return true; |
252f9411 | 875 | if (list_empty(&pool->range_list)) |
1a426b47 | 876 | return true; |
252f9411 | 877 | range = list_first_entry(&pool->range_list, struct rseq_mempool_range, node); |
1a426b47 MD |
878 | page_size = rseq_get_page_len(); |
879 | /* | |
880 | * Header first page is one page before the page containing the | |
881 | * range structure. | |
882 | */ | |
883 | addr = (char *) ((uintptr_t) range & ~(page_size - 1)) - page_size; | |
884 | /* | |
885 | * Look for 0x1 first byte marker in header first page. | |
886 | */ | |
887 | if (*addr != 0x1) | |
888 | return false; | |
889 | return true; | |
890 | } | |
891 | ||
0ba2a93e | 892 | int rseq_mempool_destroy(struct rseq_mempool *pool) |
9649c7ee | 893 | { |
252f9411 | 894 | struct rseq_mempool_range *range, *tmp_range; |
1a426b47 | 895 | bool mapping_accessible; |
b73b0c25 | 896 | int ret = 0; |
9649c7ee | 897 | |
f510ddc5 MD |
898 | if (!pool) |
899 | return 0; | |
1a426b47 MD |
900 | |
901 | /* | |
902 | * Validate that the pool mappings are accessible before doing | |
903 | * free list/poison validation and unmapping ranges. This allows | |
805d0043 MD |
904 | * calling pool destroy in child process after a fork for COW_INIT |
905 | * pools to free pool resources. | |
1a426b47 MD |
906 | */ |
907 | mapping_accessible = pool_mappings_accessible(pool); | |
908 | ||
909 | check_free_list(pool, mapping_accessible); | |
910 | check_pool_poison(pool, mapping_accessible); | |
911 | ||
b73b0c25 | 912 | /* Iteration safe against removal. */ |
252f9411 MD |
913 | list_for_each_entry_safe(range, tmp_range, &pool->range_list, node) { |
914 | list_del(&range->node); | |
915 | if (rseq_mempool_range_destroy(pool, range, mapping_accessible)) { | |
916 | /* Keep list coherent in case of partial failure. */ | |
917 | list_add(&range->node, &pool->range_list); | |
b73b0c25 | 918 | goto end; |
252f9411 | 919 | } |
b73b0c25 | 920 | } |
9649c7ee | 921 | pthread_mutex_destroy(&pool->lock); |
ca452fee | 922 | free(pool->name); |
eb8db04d | 923 | free(pool); |
9649c7ee | 924 | end: |
b73b0c25 | 925 | return ret; |
9649c7ee MD |
926 | } |
927 | ||
0ba2a93e | 928 | struct rseq_mempool *rseq_mempool_create(const char *pool_name, |
cb475906 | 929 | size_t item_len, const struct rseq_mempool_attr *_attr) |
ef6695f1 | 930 | { |
0ba2a93e | 931 | struct rseq_mempool_attr attr = {}; |
252f9411 MD |
932 | struct rseq_mempool_range *range; |
933 | struct rseq_mempool *pool; | |
ef6695f1 | 934 | int order; |
ef6695f1 MD |
935 | |
936 | /* Make sure each item is large enough to contain free list pointers. */ | |
937 | if (item_len < sizeof(void *)) | |
938 | item_len = sizeof(void *); | |
939 | ||
940 | /* Align item_len on next power of two. */ | |
19be9217 | 941 | order = rseq_get_count_order_ulong(item_len); |
ef6695f1 MD |
942 | if (order < 0) { |
943 | errno = EINVAL; | |
944 | return NULL; | |
945 | } | |
946 | item_len = 1UL << order; | |
947 | ||
a82006d0 MD |
948 | if (_attr) |
949 | memcpy(&attr, _attr, sizeof(attr)); | |
a82006d0 | 950 | |
805d0043 MD |
951 | /* |
952 | * Validate that the pool populate policy requested is known. | |
953 | */ | |
954 | switch (attr.populate_policy) { | |
955 | case RSEQ_MEMPOOL_POPULATE_COW_INIT: | |
956 | break; | |
957 | case RSEQ_MEMPOOL_POPULATE_COW_ZERO: | |
958 | break; | |
959 | default: | |
960 | errno = EINVAL; | |
961 | return NULL; | |
962 | } | |
963 | ||
cb475906 MD |
964 | switch (attr.type) { |
965 | case MEMPOOL_TYPE_PERCPU: | |
966 | if (attr.max_nr_cpus < 0) { | |
967 | errno = EINVAL; | |
968 | return NULL; | |
969 | } | |
970 | if (attr.max_nr_cpus == 0) { | |
971 | /* Auto-detect */ | |
47c725dd | 972 | attr.max_nr_cpus = rseq_get_max_nr_cpus(); |
cb475906 MD |
973 | if (attr.max_nr_cpus == 0) { |
974 | errno = EINVAL; | |
975 | return NULL; | |
976 | } | |
977 | } | |
978 | break; | |
979 | case MEMPOOL_TYPE_GLOBAL: | |
a5694a4d | 980 | /* Override populate policy for global type. */ |
805d0043 MD |
981 | if (attr.populate_policy == RSEQ_MEMPOOL_POPULATE_COW_INIT) |
982 | attr.populate_policy = RSEQ_MEMPOOL_POPULATE_COW_ZERO; | |
89b7e681 MD |
983 | /* Use a 1-cpu pool for global mempool type. */ |
984 | attr.max_nr_cpus = 1; | |
cb475906 MD |
985 | break; |
986 | } | |
987 | if (!attr.stride) | |
988 | attr.stride = RSEQ_MEMPOOL_STRIDE; /* Use default */ | |
3975084e MD |
989 | if (attr.robust_set && !attr.poison_set) { |
990 | attr.poison_set = true; | |
805d0043 | 991 | if (attr.populate_policy == RSEQ_MEMPOOL_POPULATE_COW_INIT) |
6ea98a7b MD |
992 | attr.poison = DEFAULT_COW_INIT_POISON_VALUE; |
993 | else | |
994 | attr.poison = DEFAULT_COW_ZERO_POISON_VALUE; | |
3975084e | 995 | } |
cb475906 MD |
996 | if (item_len > attr.stride || attr.stride < (size_t) rseq_get_page_len() || |
997 | !is_pow2(attr.stride)) { | |
998 | errno = EINVAL; | |
999 | return NULL; | |
1000 | } | |
1001 | ||
0ba2a93e | 1002 | pool = calloc(1, sizeof(struct rseq_mempool)); |
bc510b60 MD |
1003 | if (!pool) |
1004 | return NULL; | |
ef6695f1 | 1005 | |
b73b0c25 | 1006 | memcpy(&pool->attr, &attr, sizeof(attr)); |
ef6695f1 | 1007 | pthread_mutex_init(&pool->lock, NULL); |
ef6695f1 MD |
1008 | pool->item_len = item_len; |
1009 | pool->item_order = order; | |
252f9411 | 1010 | INIT_LIST_HEAD(&pool->range_list); |
b73b0c25 | 1011 | |
252f9411 MD |
1012 | range = rseq_mempool_range_create(pool); |
1013 | if (!range) | |
b73b0c25 | 1014 | goto error_alloc; |
252f9411 | 1015 | list_add(&range->node, &pool->range_list); |
0fdf7a4c | 1016 | |
ca452fee MD |
1017 | if (pool_name) { |
1018 | pool->name = strdup(pool_name); | |
1019 | if (!pool->name) | |
1020 | goto error_alloc; | |
1021 | } | |
ef6695f1 | 1022 | return pool; |
ef6695f1 | 1023 | |
9649c7ee | 1024 | error_alloc: |
0ba2a93e | 1025 | rseq_mempool_destroy(pool); |
9649c7ee MD |
1026 | errno = ENOMEM; |
1027 | return NULL; | |
ef6695f1 MD |
1028 | } |
1029 | ||
e7cbbc10 MD |
1030 | /* Always inline for __builtin_return_address(0). */ |
1031 | static inline __attribute__((always_inline)) | |
9d986353 | 1032 | void set_alloc_slot(struct rseq_mempool *pool, struct rseq_mempool_range *range, size_t item_offset) |
0fdf7a4c | 1033 | { |
9d986353 | 1034 | unsigned long *bitmap = range->alloc_bitmap; |
9649c7ee | 1035 | size_t item_index = item_offset >> pool->item_order; |
0fdf7a4c OD |
1036 | unsigned long mask; |
1037 | size_t k; | |
1038 | ||
9649c7ee | 1039 | if (!bitmap) |
0fdf7a4c | 1040 | return; |
0fdf7a4c | 1041 | |
9649c7ee | 1042 | k = item_index / BIT_PER_ULONG; |
0fdf7a4c OD |
1043 | mask = 1ULL << (item_index % BIT_PER_ULONG); |
1044 | ||
9649c7ee MD |
1045 | /* Print error if bit is already set. */ |
1046 | if (bitmap[k] & mask) { | |
ca452fee MD |
1047 | fprintf(stderr, "%s: Allocator corruption detected for pool: \"%s\" (%p), item offset: %zu, caller: %p.\n", |
1048 | __func__, get_pool_name(pool), pool, item_offset, (void *) __builtin_return_address(0)); | |
9649c7ee MD |
1049 | abort(); |
1050 | } | |
0fdf7a4c OD |
1051 | bitmap[k] |= mask; |
1052 | } | |
1053 | ||
ef6695f1 | 1054 | static |
6ff43d9a MD |
1055 | void __rseq_percpu *__rseq_percpu_malloc(struct rseq_mempool *pool, |
1056 | bool zeroed, void *init_ptr, size_t init_len) | |
ef6695f1 | 1057 | { |
9d986353 | 1058 | struct rseq_mempool_range *range; |
ef6695f1 MD |
1059 | struct free_list_node *node; |
1060 | uintptr_t item_offset; | |
d24ee051 | 1061 | void __rseq_percpu *addr; |
ef6695f1 | 1062 | |
6ff43d9a MD |
1063 | if (init_len > pool->item_len) { |
1064 | errno = EINVAL; | |
1065 | return NULL; | |
1066 | } | |
ef6695f1 MD |
1067 | pthread_mutex_lock(&pool->lock); |
1068 | /* Get first entry from free list. */ | |
1069 | node = pool->free_list_head; | |
1070 | if (node != NULL) { | |
a5694a4d | 1071 | void *range_base, *ptr; |
9d986353 | 1072 | |
a5694a4d MD |
1073 | ptr = __rseq_free_list_to_percpu_ptr(pool, node); |
1074 | range_base = (void *) ((uintptr_t) ptr & (~(pool->attr.stride - 1))); | |
9d986353 | 1075 | range = (struct rseq_mempool_range *) (range_base - RANGE_HEADER_OFFSET); |
ef6695f1 MD |
1076 | /* Remove node from free list (update head). */ |
1077 | pool->free_list_head = node->next; | |
a5694a4d | 1078 | item_offset = (uintptr_t) (ptr - range_base); |
86617384 | 1079 | rseq_percpu_check_poison_item(pool, range, item_offset); |
a5694a4d | 1080 | addr = __rseq_free_list_to_percpu_ptr(pool, node); |
ef6695f1 MD |
1081 | goto end; |
1082 | } | |
9d986353 | 1083 | /* |
f06c65e7 MD |
1084 | * If there are no ranges, or if the most recent range (first in |
1085 | * list) does not have any room left, create a new range and | |
1086 | * prepend it to the list head. | |
9d986353 | 1087 | */ |
f06c65e7 MD |
1088 | if (list_empty(&pool->range_list)) |
1089 | goto create_range; | |
252f9411 | 1090 | range = list_first_entry(&pool->range_list, struct rseq_mempool_range, node); |
f06c65e7 MD |
1091 | if (range->next_unused + pool->item_len > pool->attr.stride) |
1092 | goto create_range; | |
1093 | else | |
1094 | goto room_left; | |
1095 | create_range: | |
1096 | range = rseq_mempool_range_create(pool); | |
1097 | if (!range) { | |
1098 | errno = ENOMEM; | |
1099 | addr = NULL; | |
1100 | goto end; | |
ef6695f1 | 1101 | } |
f06c65e7 MD |
1102 | /* Add range to head of list. */ |
1103 | list_add(&range->node, &pool->range_list); | |
1104 | room_left: | |
9d986353 MD |
1105 | /* First range in list has room left. */ |
1106 | item_offset = range->next_unused; | |
1107 | addr = (void __rseq_percpu *) (range->base + item_offset); | |
1108 | range->next_unused += pool->item_len; | |
ef6695f1 | 1109 | end: |
ffea0dea MD |
1110 | if (addr) { |
1111 | range->allocated_items++; | |
9d986353 | 1112 | set_alloc_slot(pool, range, item_offset); |
ffea0dea | 1113 | } |
ef6695f1 | 1114 | pthread_mutex_unlock(&pool->lock); |
6ff43d9a MD |
1115 | if (addr) { |
1116 | if (zeroed) | |
1117 | rseq_percpu_zero_item(pool, range, item_offset); | |
1118 | else if (init_ptr) { | |
1119 | rseq_percpu_init_item(pool, range, item_offset, | |
1120 | init_ptr, init_len); | |
1121 | } | |
1122 | } | |
ef6695f1 MD |
1123 | return addr; |
1124 | } | |
1125 | ||
15da5c27 | 1126 | void __rseq_percpu *rseq_mempool_percpu_malloc(struct rseq_mempool *pool) |
ef6695f1 | 1127 | { |
6ff43d9a | 1128 | return __rseq_percpu_malloc(pool, false, NULL, 0); |
ef6695f1 MD |
1129 | } |
1130 | ||
15da5c27 | 1131 | void __rseq_percpu *rseq_mempool_percpu_zmalloc(struct rseq_mempool *pool) |
ef6695f1 | 1132 | { |
6ff43d9a MD |
1133 | return __rseq_percpu_malloc(pool, true, NULL, 0); |
1134 | } | |
1135 | ||
1136 | void __rseq_percpu *rseq_mempool_percpu_malloc_init(struct rseq_mempool *pool, | |
1137 | void *init_ptr, size_t len) | |
1138 | { | |
1139 | return __rseq_percpu_malloc(pool, false, init_ptr, len); | |
ef6695f1 MD |
1140 | } |
1141 | ||
e7cbbc10 MD |
1142 | /* Always inline for __builtin_return_address(0). */ |
1143 | static inline __attribute__((always_inline)) | |
9d986353 | 1144 | void clear_alloc_slot(struct rseq_mempool *pool, struct rseq_mempool_range *range, size_t item_offset) |
0fdf7a4c | 1145 | { |
9d986353 | 1146 | unsigned long *bitmap = range->alloc_bitmap; |
9649c7ee | 1147 | size_t item_index = item_offset >> pool->item_order; |
0fdf7a4c OD |
1148 | unsigned long mask; |
1149 | size_t k; | |
1150 | ||
9649c7ee | 1151 | if (!bitmap) |
0fdf7a4c | 1152 | return; |
0fdf7a4c | 1153 | |
9649c7ee MD |
1154 | k = item_index / BIT_PER_ULONG; |
1155 | mask = 1ULL << (item_index % BIT_PER_ULONG); | |
0fdf7a4c | 1156 | |
9649c7ee MD |
1157 | /* Print error if bit is not set. */ |
1158 | if (!(bitmap[k] & mask)) { | |
ca452fee MD |
1159 | fprintf(stderr, "%s: Double-free detected for pool: \"%s\" (%p), item offset: %zu, caller: %p.\n", |
1160 | __func__, get_pool_name(pool), pool, item_offset, | |
1161 | (void *) __builtin_return_address(0)); | |
9649c7ee MD |
1162 | abort(); |
1163 | } | |
0fdf7a4c OD |
1164 | bitmap[k] &= ~mask; |
1165 | } | |
1166 | ||
cb475906 | 1167 | void librseq_mempool_percpu_free(void __rseq_percpu *_ptr, size_t stride) |
ef6695f1 MD |
1168 | { |
1169 | uintptr_t ptr = (uintptr_t) _ptr; | |
cb475906 | 1170 | void *range_base = (void *) (ptr & (~(stride - 1))); |
0ba2a93e MD |
1171 | struct rseq_mempool_range *range = (struct rseq_mempool_range *) (range_base - RANGE_HEADER_OFFSET); |
1172 | struct rseq_mempool *pool = range->pool; | |
cb475906 | 1173 | uintptr_t item_offset = ptr & (stride - 1); |
ef6695f1 MD |
1174 | struct free_list_node *head, *item; |
1175 | ||
1176 | pthread_mutex_lock(&pool->lock); | |
9d986353 | 1177 | clear_alloc_slot(pool, range, item_offset); |
ffea0dea MD |
1178 | if (!range->allocated_items) { |
1179 | fprintf(stderr, "%s: Trying to free an item from an empty pool range within pool \"%s\" (%p), item offset: %zu, caller: %p.\n", | |
1180 | __func__, get_pool_name(pool), pool, item_offset, | |
1181 | (void *) __builtin_return_address(0)); | |
1182 | abort(); | |
1183 | } | |
1184 | range->allocated_items--; | |
ef6695f1 MD |
1185 | /* Add ptr to head of free list */ |
1186 | head = pool->free_list_head; | |
455e090e MD |
1187 | if (pool->attr.poison_set) |
1188 | rseq_percpu_poison_item(pool, range, item_offset); | |
a5694a4d | 1189 | item = __rseq_percpu_to_free_list_ptr(pool, _ptr); |
455e090e MD |
1190 | /* |
1191 | * Setting the next pointer will overwrite the first uintptr_t | |
805d0043 MD |
1192 | * poison for either CPU 0 (COW_ZERO, non-robust), or init data |
1193 | * (COW_INIT, non-robust). | |
455e090e | 1194 | */ |
ef6695f1 MD |
1195 | item->next = head; |
1196 | pool->free_list_head = item; | |
1197 | pthread_mutex_unlock(&pool->lock); | |
1198 | } | |
1199 | ||
0ba2a93e | 1200 | struct rseq_mempool_set *rseq_mempool_set_create(void) |
ef6695f1 | 1201 | { |
0ba2a93e | 1202 | struct rseq_mempool_set *pool_set; |
ef6695f1 | 1203 | |
0ba2a93e | 1204 | pool_set = calloc(1, sizeof(struct rseq_mempool_set)); |
ef6695f1 MD |
1205 | if (!pool_set) |
1206 | return NULL; | |
1207 | pthread_mutex_init(&pool_set->lock, NULL); | |
1208 | return pool_set; | |
1209 | } | |
1210 | ||
0ba2a93e | 1211 | int rseq_mempool_set_destroy(struct rseq_mempool_set *pool_set) |
ef6695f1 MD |
1212 | { |
1213 | int order, ret; | |
1214 | ||
1215 | for (order = POOL_SET_MIN_ENTRY; order < POOL_SET_NR_ENTRIES; order++) { | |
0ba2a93e | 1216 | struct rseq_mempool *pool = pool_set->entries[order]; |
ef6695f1 MD |
1217 | |
1218 | if (!pool) | |
1219 | continue; | |
0ba2a93e | 1220 | ret = rseq_mempool_destroy(pool); |
ef6695f1 MD |
1221 | if (ret) |
1222 | return ret; | |
1223 | pool_set->entries[order] = NULL; | |
1224 | } | |
1225 | pthread_mutex_destroy(&pool_set->lock); | |
1226 | free(pool_set); | |
1227 | return 0; | |
1228 | } | |
1229 | ||
1230 | /* Ownership of pool is handed over to pool set on success. */ | |
0ba2a93e | 1231 | int rseq_mempool_set_add_pool(struct rseq_mempool_set *pool_set, struct rseq_mempool *pool) |
ef6695f1 MD |
1232 | { |
1233 | size_t item_order = pool->item_order; | |
1234 | int ret = 0; | |
1235 | ||
1236 | pthread_mutex_lock(&pool_set->lock); | |
1237 | if (pool_set->entries[item_order]) { | |
1238 | errno = EBUSY; | |
1239 | ret = -1; | |
1240 | goto end; | |
1241 | } | |
1242 | pool_set->entries[pool->item_order] = pool; | |
1243 | end: | |
1244 | pthread_mutex_unlock(&pool_set->lock); | |
1245 | return ret; | |
1246 | } | |
1247 | ||
1248 | static | |
6ff43d9a MD |
1249 | void __rseq_percpu *__rseq_mempool_set_malloc(struct rseq_mempool_set *pool_set, |
1250 | void *init_ptr, size_t len, bool zeroed) | |
ef6695f1 MD |
1251 | { |
1252 | int order, min_order = POOL_SET_MIN_ENTRY; | |
0ba2a93e | 1253 | struct rseq_mempool *pool; |
d24ee051 | 1254 | void __rseq_percpu *addr; |
ef6695f1 | 1255 | |
d06f5cf5 MD |
1256 | order = rseq_get_count_order_ulong(len); |
1257 | if (order > POOL_SET_MIN_ENTRY) | |
1258 | min_order = order; | |
ef6695f1 MD |
1259 | again: |
1260 | pthread_mutex_lock(&pool_set->lock); | |
1261 | /* First smallest present pool where @len fits. */ | |
1262 | for (order = min_order; order < POOL_SET_NR_ENTRIES; order++) { | |
1263 | pool = pool_set->entries[order]; | |
1264 | ||
1265 | if (!pool) | |
1266 | continue; | |
1267 | if (pool->item_len >= len) | |
1268 | goto found; | |
1269 | } | |
1270 | pool = NULL; | |
1271 | found: | |
1272 | pthread_mutex_unlock(&pool_set->lock); | |
1273 | if (pool) { | |
6ff43d9a | 1274 | addr = __rseq_percpu_malloc(pool, zeroed, init_ptr, len); |
ef6695f1 MD |
1275 | if (addr == NULL && errno == ENOMEM) { |
1276 | /* | |
1277 | * If the allocation failed, try again with a | |
1278 | * larger pool. | |
1279 | */ | |
1280 | min_order = order + 1; | |
1281 | goto again; | |
1282 | } | |
1283 | } else { | |
1284 | /* Not found. */ | |
1285 | errno = ENOMEM; | |
1286 | addr = NULL; | |
1287 | } | |
1288 | return addr; | |
1289 | } | |
1290 | ||
15da5c27 | 1291 | void __rseq_percpu *rseq_mempool_set_percpu_malloc(struct rseq_mempool_set *pool_set, size_t len) |
ef6695f1 | 1292 | { |
6ff43d9a | 1293 | return __rseq_mempool_set_malloc(pool_set, NULL, len, false); |
ef6695f1 MD |
1294 | } |
1295 | ||
15da5c27 | 1296 | void __rseq_percpu *rseq_mempool_set_percpu_zmalloc(struct rseq_mempool_set *pool_set, size_t len) |
ef6695f1 | 1297 | { |
6ff43d9a MD |
1298 | return __rseq_mempool_set_malloc(pool_set, NULL, len, true); |
1299 | } | |
1300 | ||
1301 | void __rseq_percpu *rseq_mempool_set_percpu_malloc_init(struct rseq_mempool_set *pool_set, | |
1302 | void *init_ptr, size_t len) | |
1303 | { | |
1304 | return __rseq_mempool_set_malloc(pool_set, init_ptr, len, true); | |
ef6695f1 | 1305 | } |
9bd07c29 | 1306 | |
0ba2a93e | 1307 | struct rseq_mempool_attr *rseq_mempool_attr_create(void) |
a82006d0 | 1308 | { |
0ba2a93e | 1309 | return calloc(1, sizeof(struct rseq_mempool_attr)); |
a82006d0 MD |
1310 | } |
1311 | ||
/*
 * rseq_mempool_attr_destroy: Free a pool attribute structure.
 *
 * @attr is handed to free(), so a NULL @attr is a no-op.
 */
void rseq_mempool_attr_destroy(struct rseq_mempool_attr *attr)
{
	void *mem = attr;

	free(mem);
}
1316 | ||
135811f2 | 1317 | int rseq_mempool_attr_set_init(struct rseq_mempool_attr *attr, |
6e329183 | 1318 | int (*init_func)(void *priv, void *addr, size_t len, int cpu), |
135811f2 MD |
1319 | void *init_priv) |
1320 | { | |
1321 | if (!attr) { | |
1322 | errno = EINVAL; | |
1323 | return -1; | |
1324 | } | |
1325 | attr->init_set = true; | |
1326 | attr->init_func = init_func; | |
1327 | attr->init_priv = init_priv; | |
805d0043 | 1328 | attr->populate_policy = RSEQ_MEMPOOL_POPULATE_COW_INIT; |
135811f2 MD |
1329 | return 0; |
1330 | } | |
1331 | ||
0ba2a93e | 1332 | int rseq_mempool_attr_set_robust(struct rseq_mempool_attr *attr) |
d6acc8aa MD |
1333 | { |
1334 | if (!attr) { | |
1335 | errno = EINVAL; | |
1336 | return -1; | |
1337 | } | |
1338 | attr->robust_set = true; | |
1339 | return 0; | |
1340 | } | |
cb475906 MD |
1341 | |
1342 | int rseq_mempool_attr_set_percpu(struct rseq_mempool_attr *attr, | |
1343 | size_t stride, int max_nr_cpus) | |
1344 | { | |
1345 | if (!attr) { | |
1346 | errno = EINVAL; | |
1347 | return -1; | |
1348 | } | |
1349 | attr->type = MEMPOOL_TYPE_PERCPU; | |
1350 | attr->stride = stride; | |
1351 | attr->max_nr_cpus = max_nr_cpus; | |
1352 | return 0; | |
1353 | } | |
1354 | ||
1355 | int rseq_mempool_attr_set_global(struct rseq_mempool_attr *attr, | |
1356 | size_t stride) | |
1357 | { | |
1358 | if (!attr) { | |
1359 | errno = EINVAL; | |
1360 | return -1; | |
1361 | } | |
1362 | attr->type = MEMPOOL_TYPE_GLOBAL; | |
1363 | attr->stride = stride; | |
89b7e681 | 1364 | attr->max_nr_cpus = 0; |
cb475906 MD |
1365 | return 0; |
1366 | } | |
6037d364 | 1367 | |
e11a02d7 MD |
1368 | int rseq_mempool_attr_set_max_nr_ranges(struct rseq_mempool_attr *attr, |
1369 | unsigned long max_nr_ranges) | |
1370 | { | |
1371 | if (!attr) { | |
1372 | errno = EINVAL; | |
1373 | return -1; | |
1374 | } | |
1375 | attr->max_nr_ranges = max_nr_ranges; | |
1376 | return 0; | |
1377 | } | |
1378 | ||
455e090e MD |
1379 | int rseq_mempool_attr_set_poison(struct rseq_mempool_attr *attr, |
1380 | uintptr_t poison) | |
1381 | { | |
1382 | if (!attr) { | |
1383 | errno = EINVAL; | |
1384 | return -1; | |
1385 | } | |
1386 | attr->poison_set = true; | |
1387 | attr->poison = poison; | |
1388 | return 0; | |
1389 | } | |
1390 | ||
a5694a4d MD |
1391 | int rseq_mempool_attr_set_populate_policy(struct rseq_mempool_attr *attr, |
1392 | enum rseq_mempool_populate_policy policy) | |
1393 | { | |
1394 | if (!attr) { | |
1395 | errno = EINVAL; | |
1396 | return -1; | |
1397 | } | |
1398 | attr->populate_policy = policy; | |
1399 | return 0; | |
1400 | } | |
1401 | ||
6037d364 MD |
1402 | int rseq_mempool_get_max_nr_cpus(struct rseq_mempool *mempool) |
1403 | { | |
1404 | if (!mempool || mempool->attr.type != MEMPOOL_TYPE_PERCPU) { | |
1405 | errno = EINVAL; | |
1406 | return -1; | |
1407 | } | |
1408 | return mempool->attr.max_nr_cpus; | |
1409 | } |