From 367e559c27a9c9637ff26c9531fd44ef22991aea Mon Sep 17 00:00:00 2001
From: Mathieu Desnoyers
Date: Thu, 29 Feb 2024 17:24:33 -0500
Subject: [PATCH] rseq percpu alloc: Implement numa support

Signed-off-by: Mathieu Desnoyers
Change-Id: I732b632f476ffef362a1ab486bcf425e4ded6644
---
 configure.ac                |  25 ++++++-
 include/rseq/percpu-alloc.h |   3 +-
 src/Makefile.am             |   4 ++
 src/rseq-percpu-alloc.c     | 130 ++++++++++++++++++++++++++----------
 4 files changed, 123 insertions(+), 39 deletions(-)

diff --git a/configure.ac b/configure.ac
index ac0883f..4f58d02 100644
--- a/configure.ac
+++ b/configure.ac
@@ -196,11 +196,32 @@ PKG_CHECK_MODULES([SECCOMP], [libseccomp],
 ##                                                                          ##
 ##                       Optional features selection                       ##
 ##                                                                          ##
 
+# Enabled by default
+AE_FEATURE_DEFAULT_ENABLE
+AE_FEATURE([numa],[disable NUMA support])
+
 # When given, add -Werror to WARN_CFLAGS and WARN_CXXFLAGS.
 # Disabled by default
 AE_FEATURE_DEFAULT_DISABLE
 AE_FEATURE([Werror], [Treat compiler warnings as errors.])
 
+##                                                                          ##
+##               Check for optional features dependencies                  ##
+##                                                                          ##
+
+# The numa integration requires libnuma
+AE_IF_FEATURE_ENABLED([numa], [
+	AC_CHECK_LIB([numa], [numa_available], [
+		AC_DEFINE([HAVE_LIBNUMA], [1], [Define to 1 if libnuma is available.])
+	], [
+		AC_MSG_ERROR([dnl
+libnuma is not available. Please either install it (e.g. libnuma-dev) or use
+[LDFLAGS]=-Ldir to specify the right location, or use --disable-numa configure
+argument to disable NUMA support.
+		])
+	])
+])
+
 ##                                                                          ##
 ## Set automake variables for optional feature conditionnals in Makefile.am ##
@@ -208,7 +229,7 @@ AE_FEATURE([Werror], [Treat compiler warnings as errors.])
 
 AM_CONDITIONAL([ENABLE_SHARED], [test "x${enable_shared}" = "xyes"])
 AM_CONDITIONAL([ENABLE_SECCOMP], [test "x${have_seccomp}" = "xyes"])
-
+AM_CONDITIONAL([ENABLE_NUMA], AE_IS_FEATURE_ENABLED([numa]))
 
 ##                                                                          ##
 ## Substitute variables for use in Makefile.am                              ##
@@ -264,6 +285,8 @@ AS_ECHO
 PPRINT_SUBTITLE([Features])
 
 PPRINT_PROP_STRING([Target architecture], $host_cpu)
+AE_IS_FEATURE_ENABLED([numa]) && value=1 || value=0
+PPRINT_PROP_BOOL([NUMA], $value)
 
 report_bindir="`eval eval echo $bindir`"
 report_libdir="`eval eval echo $libdir`"
diff --git a/include/rseq/percpu-alloc.h b/include/rseq/percpu-alloc.h
index c1ea96a..546f5c0 100644
--- a/include/rseq/percpu-alloc.h
+++ b/include/rseq/percpu-alloc.h
@@ -15,7 +15,8 @@ struct rseq_percpu_pool;
 
 struct rseq_percpu_pool *rseq_percpu_pool_create(size_t item_len,
 		size_t percpu_len, int max_nr_cpus,
-		int prot, int flags, int fd, off_t offset);
+		int mmap_prot, int mmap_flags, int mmap_fd, off_t mmap_offset,
+		int numa_flags);
 int rseq_percpu_pool_destroy(struct rseq_percpu_pool *pool);
 
 void *rseq_percpu_malloc(struct rseq_percpu_pool *pool);
diff --git a/src/Makefile.am b/src/Makefile.am
index c9e134c..9ef6cfb 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -9,5 +9,9 @@ librseq_la_SOURCES = \
 librseq_la_LDFLAGS = -no-undefined -version-info $(RSEQ_LIBRARY_VERSION)
 librseq_la_LIBADD = $(DL_LIBS)
 
+if ENABLE_NUMA
+librseq_la_LIBADD += -lnuma
+endif
+
 pkgconfigdir = $(libdir)/pkgconfig
 pkgconfig_DATA = librseq.pc
diff --git a/src/rseq-percpu-alloc.c b/src/rseq-percpu-alloc.c
index 12f8b2b..3b48d51 100644
--- a/src/rseq-percpu-alloc.c
+++ b/src/rseq-percpu-alloc.c
@@ -12,6 +12,12 @@
 #include
 #include
 #include
+#include
+
+#ifdef HAVE_LIBNUMA
+# include <numa.h>
+# include <numaif.h>
+#endif
 
 /*
  * rseq-percpu-alloc.c: rseq per-cpu memory allocator.
@@ -176,15 +182,96 @@ int get_count_order_ulong(unsigned long x)
 	return fls_ulong(x - 1);
 }
 
+static
+long rseq_get_page_len(void)
+{
+	long page_len = sysconf(_SC_PAGE_SIZE);
+
+	if (page_len < 0)
+		page_len = DEFAULT_PAGE_SIZE;
+	return page_len;
+}
+
+static
+void *__rseq_pool_percpu_ptr(struct rseq_percpu_pool *pool, int cpu, uintptr_t item_offset)
+{
+	return pool->base + (pool->percpu_len * cpu) + item_offset;
+}
+
+void *__rseq_percpu_ptr(void *_ptr, int cpu)
+{
+	uintptr_t ptr = (uintptr_t) _ptr;
+	uintptr_t item_offset = ptr >> OFFSET_SHIFT;
+	uintptr_t pool_index = ptr & POOL_MASK;
+	struct rseq_percpu_pool *pool = &rseq_percpu_pool[pool_index];
+
+	assert(cpu >= 0);
+	return __rseq_pool_percpu_ptr(pool, cpu, item_offset);
+}
+
+static
+void rseq_percpu_zero_item(struct rseq_percpu_pool *pool, uintptr_t item_offset)
+{
+	int i;
+
+	for (i = 0; i < pool->max_nr_cpus; i++) {
+		char *p = __rseq_pool_percpu_ptr(pool, i, item_offset);
+		memset(p, 0, pool->item_len);
+	}
+}
+
+#ifdef HAVE_LIBNUMA
+static
+void rseq_percpu_pool_init_numa(struct rseq_percpu_pool *pool,
+		int numa_flags)
+{
+	unsigned long nr_pages, page;
+	long ret, page_len;
+	int cpu;
+
+	if (!numa_flags)
+		return;
+	page_len = rseq_get_page_len();
+	nr_pages = pool->percpu_len >> get_count_order_ulong(page_len);
+	for (cpu = 0; cpu < pool->max_nr_cpus; cpu++) {
+		int node = numa_node_of_cpu(cpu);
+
+		/* TODO: batch move_pages() call with an array of pages. */
+		for (page = 0; page < nr_pages; page++) {
+			void *pageptr = __rseq_pool_percpu_ptr(pool, cpu, page * page_len);
+			int status = -EPERM;
+
+			ret = move_pages(0, 1, &pageptr, &node, &status, numa_flags);
+			if (ret) {
+				perror("move_pages");
+				abort();
+			}
+		}
+	}
+}
+#else
+static
+void rseq_percpu_pool_init_numa(struct rseq_percpu_pool *pool __attribute__((unused)),
+		int numa_flags __attribute__((unused)))
+{
+}
+#endif
+
+/*
+ * Expected numa_flags:
+ *   0: do not move pages to specific numa nodes (use for e.g. mm_cid indexing).
+ *   MPOL_MF_MOVE: move process-private pages to cpu-specific numa nodes.
+ *   MPOL_MF_MOVE_ALL: move shared pages to cpu-specific numa nodes (requires CAP_SYS_NICE).
+ */
 struct rseq_percpu_pool *rseq_percpu_pool_create(size_t item_len,
 		size_t percpu_len, int max_nr_cpus,
-		int prot, int flags, int fd, off_t offset)
+		int mmap_prot, int mmap_flags, int mmap_fd,
+		off_t mmap_offset, int numa_flags)
 {
 	struct rseq_percpu_pool *pool;
 	void *base;
 	unsigned int i;
 	int order;
-	long page_len;
 
 	/* Make sure each item is large enough to contain free list pointers. */
 	if (item_len < sizeof(void *))
@@ -199,10 +286,7 @@ struct rseq_percpu_pool *rseq_percpu_pool_create(size_t item_len,
 	item_len = 1UL << order;
 
 	/* Align percpu_len on page size. */
-	page_len = sysconf(_SC_PAGE_SIZE);
-	if (page_len < 0)
-		page_len = DEFAULT_PAGE_SIZE;
-	percpu_len = rseq_align(percpu_len, page_len);
+	percpu_len = rseq_align(percpu_len, rseq_get_page_len());
 
 	if (max_nr_cpus < 0 || item_len > percpu_len ||
 			percpu_len > (UINTPTR_MAX >> OFFSET_SHIFT)) {
@@ -222,13 +306,13 @@ struct rseq_percpu_pool *rseq_percpu_pool_create(size_t item_len,
 		goto end;
 
 found_empty:
-	base = mmap(NULL, percpu_len * max_nr_cpus, prot, flags, fd, offset);
+	base = mmap(NULL, percpu_len * max_nr_cpus, mmap_prot,
+			mmap_flags, mmap_fd, mmap_offset);
 	if (base == MAP_FAILED) {
 		pool = NULL;
 		goto end;
 	}
-	// TODO: integrate with libnuma to provide NUMA placement hints.
-	// See move_pages(2).
+	rseq_percpu_pool_init_numa(pool, numa_flags);
 	pthread_mutex_init(&pool->lock, NULL);
 	pool->base = base;
 	pool->percpu_len = percpu_len;
@@ -261,34 +345,6 @@ end:
 	return 0;
 }
 
-static
-void *__rseq_pool_percpu_ptr(struct rseq_percpu_pool *pool, int cpu, uintptr_t item_offset)
-{
-	return pool->base + (pool->percpu_len * cpu) + item_offset;
-}
-
-void *__rseq_percpu_ptr(void *_ptr, int cpu)
-{
-	uintptr_t ptr = (uintptr_t) _ptr;
-	uintptr_t item_offset = ptr >> OFFSET_SHIFT;
-	uintptr_t pool_index = ptr & POOL_MASK;
-	struct rseq_percpu_pool *pool = &rseq_percpu_pool[pool_index];
-
-	assert(cpu >= 0);
-	return __rseq_pool_percpu_ptr(pool, cpu, item_offset);
-}
-
-static
-void rseq_percpu_zero_item(struct rseq_percpu_pool *pool, uintptr_t item_offset)
-{
-	int i;
-
-	for (i = 0; i < pool->max_nr_cpus; i++) {
-		char *p = __rseq_pool_percpu_ptr(pool, i, item_offset);
-		memset(p, 0, pool->item_len);
-	}
-}
-
 static
 void *__rseq_percpu_malloc(struct rseq_percpu_pool *pool, bool zeroed)
 {
-- 
2.34.1
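
Editor's note: for readers unfamiliar with the allocator API, below is a caller-side sketch of how the extended rseq_percpu_pool_create() signature could be used once this patch is applied. The example is not part of the patch; the struct, the pool sizing, the helper name and the use of MAP_POPULATE are illustrative assumptions. Only the parameter order and the numa_flags values (0, MPOL_MF_MOVE, MPOL_MF_MOVE_ALL) come from the patch itself.

#include <rseq/percpu-alloc.h>
#include <sys/mman.h>
#include <numaif.h>	/* MPOL_MF_MOVE */

/* Hypothetical per-CPU data managed by the pool. */
struct percpu_counter {
	long count;
};

static struct rseq_percpu_pool *create_counter_pool(void)
{
	/*
	 * Reserve 256 kB per CPU for up to 64 CPUs. The mmap_* arguments are
	 * passed through to mmap(2); MAP_POPULATE prefaults the mapping so
	 * that move_pages(2) finds resident pages to migrate. Passing
	 * MPOL_MF_MOVE as numa_flags asks the allocator to move each per-CPU
	 * range to the NUMA node of the corresponding CPU.
	 */
	return rseq_percpu_pool_create(sizeof(struct percpu_counter),
			256 * 1024, 64,
			PROT_READ | PROT_WRITE,
			MAP_PRIVATE | MAP_ANONYMOUS | MAP_POPULATE,
			-1, 0,
			MPOL_MF_MOVE);
}

Passing 0 as numa_flags keeps the previous behaviour (no page migration), and MPOL_MF_MOVE_ALL additionally migrates shared pages but requires CAP_SYS_NICE, as stated in the comment added above rseq_percpu_pool_create().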