mempool: Introduce optional stride parameter
diff --git a/tests/param_test.c b/tests/param_test.c
index b423767b2e8ef7c79bd0d036098f25898f69b84b..bb8b15a3957bb984902ad34051a1d79b90d40854 100644
--- a/tests/param_test.c
+++ b/tests/param_test.c
@@ -1,8 +1,11 @@
-// SPDX-License-Identifier: LGPL-2.1-only
+// SPDX-License-Identifier: MIT
+// SPDX-FileCopyrightText: 2020-2022 Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
 #ifndef _GNU_SOURCE
 #define _GNU_SOURCE
 #endif
 #include <assert.h>
+#include <linux/version.h>
+#include <linux/membarrier.h>
 #include <pthread.h>
 #include <sched.h>
 #include <stdint.h>
 #include <signal.h>
 #include <errno.h>
 #include <stddef.h>
+#include <stdbool.h>
+#include <rseq/mempool.h>
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(5,10,0)
+enum {
+       MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ                   = (1 << 7),
+       MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ          = (1 << 8),
+};
+
+enum {
+       MEMBARRIER_CMD_FLAG_CPU         = (1 << 0),
+};
+#endif
 
 #define NR_INJECT      9
 static int loop_cnt[NR_INJECT + 1];
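The compat enums above only let the test build against pre-5.10 kernel headers; whether the running kernel actually implements the commands still has to be probed at runtime. A minimal sketch of such a probe (the patch adds an equivalent membarrier_private_expedited_rseq_available() helper further down; this assumes <sys/syscall.h> and <stdbool.h> are available, as in this file):

    /* Sketch: query the kernel's membarrier command mask at runtime. */
    static bool have_membarrier_rseq(void)
    {
            int mask = syscall(__NR_membarrier, MEMBARRIER_CMD_QUERY, 0, 0);

            return mask >= 0 && (mask & MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ);
    }
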
@@ -31,24 +47,20 @@ static int opt_modulo, verbose;
 
 static int opt_yield, opt_signal, opt_sleep,
                opt_disable_rseq, opt_threads = 200,
-               opt_disable_mod = 0, opt_test = 's', opt_mb = 0;
+               opt_disable_mod = 0, opt_test = 's';
 
-#ifndef RSEQ_SKIP_FASTPATH
 static long long opt_reps = 5000;
-#else
-static long long opt_reps = 100;
-#endif
 
 static __thread __attribute__((tls_model("initial-exec")))
 unsigned int signals_delivered;
 
-#ifndef BENCHMARK
-
 static inline pid_t rseq_gettid(void)
 {
        return syscall(__NR_gettid);
 }
 
+#ifndef BENCHMARK
+
 static __thread __attribute__((tls_model("initial-exec"), unused))
 int yield_mod_cnt, nr_abort;
 
@@ -65,8 +77,13 @@ int yield_mod_cnt, nr_abort;
 #define RSEQ_INJECT_CLOBBER \
        , INJECT_ASM_REG
 
-#define RSEQ_INJECT_ASM(n) \
-       "mov asm_loop_cnt_" #n ", %%" INJECT_ASM_REG "\n\t" \
+/*
+ * Use ip-relative addressing to get the loop counter.
+ */
+#define __RSEQ_INJECT_ASM(n, ref_ip, ref_label) \
+       "movl " __rseq_str(ref_ip) ", %%" INJECT_ASM_REG "\n\t" \
+       "leal ( asm_loop_cnt_" #n " - " __rseq_str(ref_label) "b)(%%" INJECT_ASM_REG "), %%" INJECT_ASM_REG "\n\t" \
+       "movl (%%" INJECT_ASM_REG "), %%" INJECT_ASM_REG "\n\t" \
        "test %%" INJECT_ASM_REG ",%%" INJECT_ASM_REG "\n\t" \
        "jz 333f\n\t" \
        "222:\n\t" \
@@ -74,6 +91,9 @@ int yield_mod_cnt, nr_abort;
        "jnz 222b\n\t" \
        "333:\n\t"
 
+#define RSEQ_INJECT_ASM(n) \
+       __RSEQ_INJECT_ASM(n, %[ref_ip], RSEQ_ASM_REF_LABEL)
+
 #elif defined(__x86_64__)
 
 #define INJECT_ASM_REG_P       "rax"
@@ -161,7 +181,7 @@ int yield_mod_cnt, nr_abort;
        "       cbnz    " INJECT_ASM_REG ", 222b\n"                     \
        "333:\n"
 
-#elif __PPC__
+#elif defined(__PPC__)
 
 #define RSEQ_INJECT_INPUT \
        , [loop_cnt_1]"m"(loop_cnt[1]) \
@@ -208,6 +228,29 @@ int yield_mod_cnt, nr_abort;
        "bnez " INJECT_ASM_REG ", 222b\n\t" \
        "333:\n\t"
 
+#elif defined(__riscv)
+
+#define RSEQ_INJECT_INPUT \
+       , [loop_cnt_1]"m"(loop_cnt[1]) \
+       , [loop_cnt_2]"m"(loop_cnt[2]) \
+       , [loop_cnt_3]"m"(loop_cnt[3]) \
+       , [loop_cnt_4]"m"(loop_cnt[4]) \
+       , [loop_cnt_5]"m"(loop_cnt[5]) \
+       , [loop_cnt_6]"m"(loop_cnt[6])
+
+#define INJECT_ASM_REG  "t1"
+
+#define RSEQ_INJECT_CLOBBER \
+       , INJECT_ASM_REG
+
+#define RSEQ_INJECT_ASM(n)                                      \
+       "lw " INJECT_ASM_REG ", %[loop_cnt_" #n "]\n\t"         \
+       "beqz " INJECT_ASM_REG ", 333f\n\t"                     \
+       "222:\n\t"                                              \
+       "addi  " INJECT_ASM_REG "," INJECT_ASM_REG ", -1\n\t"   \
+       "bnez " INJECT_ASM_REG ", 222b\n\t"                     \
+       "333:\n\t"
+
 #else
 #error unsupported target
 #endif
@@ -245,35 +288,94 @@ int yield_mod_cnt, nr_abort;
 
 #include <rseq/rseq.h>
 
-struct percpu_lock_entry {
-       intptr_t v;
-} __attribute__((aligned(128)));
+static enum rseq_mo opt_mo = RSEQ_MO_RELAXED;
+
+static int sys_membarrier(int cmd, int flags, int cpu_id)
+{
+       return syscall(__NR_membarrier, cmd, flags, cpu_id);
+}
+
+#ifdef rseq_arch_has_load_cbne_load_add_load_add_store
+#define TEST_MEMBARRIER
+#endif
+
+#ifdef BUILDOPT_RSEQ_PERCPU_MM_CID
+# define RSEQ_PERCPU   RSEQ_PERCPU_MM_CID
+static
+int get_current_cpu_id(void)
+{
+       return rseq_current_mm_cid();
+}
+static
+bool rseq_validate_cpu_id(void)
+{
+       return rseq_mm_cid_available();
+}
+static
+bool rseq_use_cpu_index(void)
+{
+       return false;   /* Use mm_cid */
+}
+# ifdef TEST_MEMBARRIER
+/*
+ * Membarrier does not currently support targeting a mm_cid, so
+ * issue the barrier on all cpus.
+ */
+static
+int rseq_membarrier_expedited(__attribute__ ((unused)) int cpu)
+{
+       return sys_membarrier(MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ,
+                             0, 0);
+}
+# endif /* TEST_MEMBARRIER */
+#else
+# define RSEQ_PERCPU   RSEQ_PERCPU_CPU_ID
+static
+int get_current_cpu_id(void)
+{
+       return rseq_cpu_start();
+}
+static
+bool rseq_validate_cpu_id(void)
+{
+       return rseq_current_cpu_raw() >= 0;
+}
+static
+bool rseq_use_cpu_index(void)
+{
+       return true;    /* Use cpu_id as index. */
+}
+# ifdef TEST_MEMBARRIER
+static
+int rseq_membarrier_expedited(int cpu)
+{
+       return sys_membarrier(MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ,
+                             MEMBARRIER_CMD_FLAG_CPU, cpu);
+}
+# endif /* TEST_MEMBARRIER */
+#endif
 
 struct percpu_lock {
-       struct percpu_lock_entry c[CPU_SETSIZE];
+       intptr_t v;
 };
 
-struct test_data_entry {
-       intptr_t count;
-} __attribute__((aligned(128)));
-
 struct spinlock_test_data {
        struct percpu_lock lock;
-       struct test_data_entry c[CPU_SETSIZE];
+       intptr_t count;
 };
 
 struct spinlock_thread_test_data {
-       struct spinlock_test_data *data;
+       struct spinlock_test_data __rseq_percpu *data;
        long long reps;
        int reg;
 };
 
 struct inc_test_data {
-       struct test_data_entry c[CPU_SETSIZE];
+       intptr_t count;
 };
 
 struct inc_thread_test_data {
-       struct inc_test_data *data;
+       struct inc_test_data __rseq_percpu *data;
        long long reps;
        int reg;
 };
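
All five tests get the same conversion: the cache-line-aligned c[CPU_SETSIZE] arrays disappear, each structure keeps a single CPU's worth of state, and the per-CPU copies come from an rseq mempool, indexed either by cpu_id or by mm_cid through get_current_cpu_id(). A condensed sketch of the resulting life cycle, using only calls that appear in this patch:

    struct rseq_mempool *pool;
    struct percpu_lock __rseq_percpu *lock;

    pool = rseq_mempool_create("percpu_lock", sizeof(struct percpu_lock),
                    0 /* stride: 0 = library default */, CPU_SETSIZE, NULL);
    lock = (struct percpu_lock __rseq_percpu *) rseq_percpu_zmalloc(pool);

    /* One instance per CPU index (cpu_id or mm_cid). */
    rseq_percpu_ptr(lock, get_current_cpu_id())->v = 0;

    rseq_percpu_free(lock);
    (void) rseq_mempool_destroy(pool);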
@@ -283,12 +385,8 @@ struct percpu_list_node {
        struct percpu_list_node *next;
 };
 
-struct percpu_list_entry {
-       struct percpu_list_node *head;
-} __attribute__((aligned(128)));
-
 struct percpu_list {
-       struct percpu_list_entry c[CPU_SETSIZE];
+       struct percpu_list_node *head;
 };
 
 #define BUFFER_ITEM_PER_CPU    100
@@ -297,14 +395,10 @@ struct percpu_buffer_node {
        intptr_t data;
 };
 
-struct percpu_buffer_entry {
+struct percpu_buffer {
        intptr_t offset;
        intptr_t buflen;
        struct percpu_buffer_node **array;
-} __attribute__((aligned(128)));
-
-struct percpu_buffer {
-       struct percpu_buffer_entry c[CPU_SETSIZE];
 };
 
 #define MEMCPY_BUFFER_ITEM_PER_CPU     100
@@ -314,26 +408,28 @@ struct percpu_memcpy_buffer_node {
        uint64_t data2;
 };
 
-struct percpu_memcpy_buffer_entry {
+struct percpu_memcpy_buffer {
        intptr_t offset;
        intptr_t buflen;
        struct percpu_memcpy_buffer_node *array;
-} __attribute__((aligned(128)));
-
-struct percpu_memcpy_buffer {
-       struct percpu_memcpy_buffer_entry c[CPU_SETSIZE];
 };
 
 /* A simple percpu spinlock. Grabs lock on current cpu. */
-static int rseq_this_cpu_lock(struct percpu_lock *lock)
+static int rseq_this_cpu_lock(struct percpu_lock __rseq_percpu *lock)
 {
        int cpu;
 
        for (;;) {
                int ret;
 
-               cpu = rseq_cpu_start();
-               ret = rseq_cmpeqv_storev(&lock->c[cpu].v,
+               cpu = get_current_cpu_id();
+               if (cpu < 0) {
+                       fprintf(stderr, "pid: %d: tid: %d, cpu: %d: cid: %d\n",
+                               getpid(), (int) rseq_gettid(), rseq_current_cpu_raw(), cpu);
+                       abort();
+               }
+               ret = rseq_load_cbne_store__ptr(RSEQ_MO_RELAXED, RSEQ_PERCPU,
+                                        &rseq_percpu_ptr(lock, cpu)->v,
                                         0, 1, cpu);
                if (rseq_likely(!ret))
                        break;
@@ -347,20 +443,20 @@ static int rseq_this_cpu_lock(struct percpu_lock *lock)
        return cpu;
 }
 
-static void rseq_percpu_unlock(struct percpu_lock *lock, int cpu)
+static void rseq_percpu_unlock(struct percpu_lock __rseq_percpu *lock, int cpu)
 {
-       assert(lock->c[cpu].v == 1);
+       assert(rseq_percpu_ptr(lock, cpu)->v == 1);
        /*
         * Release lock, with release semantic. Matches
         * rseq_smp_acquire__after_ctrl_dep().
         */
-       rseq_smp_store_release(&lock->c[cpu].v, 0);
+       rseq_smp_store_release(&rseq_percpu_ptr(lock, cpu)->v, 0);
 }
 
-void *test_percpu_spinlock_thread(void *arg)
+static void *test_percpu_spinlock_thread(void *arg)
 {
        struct spinlock_thread_test_data *thread_data = (struct spinlock_thread_test_data *) arg;
-       struct spinlock_test_data *data = thread_data->data;
+       struct spinlock_test_data __rseq_percpu *data = thread_data->data;
        long long i, reps;
 
        if (!opt_disable_rseq && thread_data->reg &&
@@ -368,10 +464,8 @@ void *test_percpu_spinlock_thread(void *arg)
                abort();
        reps = thread_data->reps;
        for (i = 0; i < reps; i++) {
-               int cpu = rseq_cpu_start();
-
-               cpu = rseq_this_cpu_lock(&data->lock);
-               data->c[cpu].count++;
+               int cpu = rseq_this_cpu_lock(&data->lock);
+               rseq_percpu_ptr(data, cpu)->count++;
                rseq_percpu_unlock(&data->lock, cpu);
 #ifndef BENCHMARK
                if (i != 0 && !(i % (reps / 10)))
@@ -393,23 +487,36 @@ void *test_percpu_spinlock_thread(void *arg)
  * per-cpu increment; however, this is reasonable for a test and the
  * lock can be extended to synchronize more complicated operations.
  */
-void test_percpu_spinlock(void)
+static void test_percpu_spinlock(void)
 {
        const int num_threads = opt_threads;
        int i, ret;
        uint64_t sum;
        pthread_t test_threads[num_threads];
-       struct spinlock_test_data data;
+       struct spinlock_test_data __rseq_percpu *data;
        struct spinlock_thread_test_data thread_data[num_threads];
+       struct rseq_mempool *mempool;
+
+       mempool = rseq_mempool_create("spinlock_test_data",
+                       sizeof(struct spinlock_test_data),
+                       0, CPU_SETSIZE, NULL);
+       if (!mempool) {
+               perror("rseq_mempool_create");
+               abort();
+       }
+       data = (struct spinlock_test_data __rseq_percpu *)rseq_percpu_zmalloc(mempool);
+       if (!data) {
+               perror("rseq_percpu_zmalloc");
+               abort();
+       }
 
-       memset(&data, 0, sizeof(data));
        for (i = 0; i < num_threads; i++) {
                thread_data[i].reps = opt_reps;
                if (opt_disable_mod <= 0 || (i % opt_disable_mod))
                        thread_data[i].reg = 1;
                else
                        thread_data[i].reg = 0;
-               thread_data[i].data = &data;
+               thread_data[i].data = data;
                ret = pthread_create(&test_threads[i], NULL,
                                     test_percpu_spinlock_thread,
                                     &thread_data[i]);
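
The third argument to rseq_mempool_create() is the stride this commit's title refers to: per-CPU copies of an item are laid out a fixed stride apart, so rseq_percpu_ptr() can resolve a per-CPU address as base + index * stride. Every call site in this test passes 0, selecting the library default. A caller whose per-CPU data outgrows the default could presumably pass an explicit stride instead; hypothetical example (the type and size are made up):

    mempool = rseq_mempool_create("big_test_data",
                    sizeof(struct big_test_data),   /* hypothetical type */
                    1UL << 20 /* explicit 1 MiB stride */, CPU_SETSIZE, NULL);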
@@ -431,15 +538,21 @@ void test_percpu_spinlock(void)
 
        sum = 0;
        for (i = 0; i < CPU_SETSIZE; i++)
-               sum += data.c[i].count;
+               sum += rseq_percpu_ptr(data, i)->count;
 
        assert(sum == (uint64_t)opt_reps * num_threads);
+       rseq_percpu_free(data);
+       ret = rseq_mempool_destroy(mempool);
+       if (ret) {
+               perror("rseq_mempool_destroy");
+               abort();
+       }
 }
 
-void *test_percpu_inc_thread(void *arg)
+static void *test_percpu_inc_thread(void *arg)
 {
        struct inc_thread_test_data *thread_data = (struct inc_thread_test_data *) arg;
-       struct inc_test_data *data = thread_data->data;
+       struct inc_test_data __rseq_percpu *data = thread_data->data;
        long long i, reps;
 
        if (!opt_disable_rseq && thread_data->reg &&
@@ -452,8 +565,9 @@ void *test_percpu_inc_thread(void *arg)
                do {
                        int cpu;
 
-                       cpu = rseq_cpu_start();
-                       ret = rseq_addv(&data->c[cpu].count, 1, cpu);
+                       cpu = get_current_cpu_id();
+                       ret = rseq_load_add_store__ptr(RSEQ_MO_RELAXED, RSEQ_PERCPU,
+                                       &rseq_percpu_ptr(data, cpu)->count, 1, cpu);
                } while (rseq_unlikely(ret));
 #ifndef BENCHMARK
                if (i != 0 && !(i % (reps / 10)))
@@ -469,23 +583,36 @@ void *test_percpu_inc_thread(void *arg)
        return NULL;
 }
 
-void test_percpu_inc(void)
+static void test_percpu_inc(void)
 {
        const int num_threads = opt_threads;
        int i, ret;
        uint64_t sum;
        pthread_t test_threads[num_threads];
-       struct inc_test_data data;
+       struct inc_test_data __rseq_percpu *data;
        struct inc_thread_test_data thread_data[num_threads];
+       struct rseq_mempool *mempool;
+
+       mempool = rseq_mempool_create("inc_test_data",
+                       sizeof(struct inc_test_data),
+                       0, CPU_SETSIZE, NULL);
+       if (!mempool) {
+               perror("rseq_mempool_create");
+               abort();
+       }
+       data = (struct inc_test_data __rseq_percpu *)rseq_percpu_zmalloc(mempool);
+       if (!data) {
+               perror("rseq_percpu_zmalloc");
+               abort();
+       }
 
-       memset(&data, 0, sizeof(data));
        for (i = 0; i < num_threads; i++) {
                thread_data[i].reps = opt_reps;
                if (opt_disable_mod <= 0 || (i % opt_disable_mod))
                        thread_data[i].reg = 1;
                else
                        thread_data[i].reg = 0;
-               thread_data[i].data = &data;
+               thread_data[i].data = data;
                ret = pthread_create(&test_threads[i], NULL,
                                     test_percpu_inc_thread,
                                     &thread_data[i]);
@@ -507,12 +634,18 @@ void test_percpu_inc(void)
 
        sum = 0;
        for (i = 0; i < CPU_SETSIZE; i++)
-               sum += data.c[i].count;
+               sum += rseq_percpu_ptr(data, i)->count;
 
        assert(sum == (uint64_t)opt_reps * num_threads);
+       rseq_percpu_free(data);
+       ret = rseq_mempool_destroy(mempool);
+       if (ret) {
+               perror("rseq_mempool_destroy");
+               abort();
+       }
 }
 
-void this_cpu_list_push(struct percpu_list *list,
+static void this_cpu_list_push(struct percpu_list __rseq_percpu *list,
                        struct percpu_list_node *node,
                        int *_cpu)
 {
@@ -520,15 +653,18 @@ void this_cpu_list_push(struct percpu_list *list,
 
        for (;;) {
                intptr_t *targetptr, newval, expect;
+               struct percpu_list *cpulist;
                int ret;
 
-               cpu = rseq_cpu_start();
+               cpu = get_current_cpu_id();
+               cpulist = rseq_percpu_ptr(list, cpu);
-               /* Load list->c[cpu].head with single-copy atomicity. */
+               /* Load cpulist->head with single-copy atomicity. */
-               expect = (intptr_t)RSEQ_READ_ONCE(list->c[cpu].head);
+               expect = (intptr_t)RSEQ_READ_ONCE(cpulist->head);
                newval = (intptr_t)node;
-               targetptr = (intptr_t *)&list->c[cpu].head;
+               targetptr = (intptr_t *)&cpulist->head;
                node->next = (struct percpu_list_node *)expect;
-               ret = rseq_cmpeqv_storev(targetptr, expect, newval, cpu);
+               ret = rseq_load_cbne_store__ptr(RSEQ_MO_RELAXED, RSEQ_PERCPU,
+                                        targetptr, expect, newval, cpu);
                if (rseq_likely(!ret))
                        break;
                /* Retry if comparison fails or rseq aborts. */
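
This hunk also shows the new naming scheme for the rseq fast paths: the name enumerates the steps of the critical section, and the memory order and per-CPU index type become explicit arguments. rseq_load_cbne_store__ptr() is the former rseq_cmpeqv_storev(); as non-atomic pseudo-C (the real sequence is restarted by the kernel on preemption, migration or signal delivery):

    /* rseq_load_cbne_store__ptr(mo, percpu, v, expect, newv, cpu), sketched */
    if (*v != expect)       /* load + compare-branch-if-not-equal */
            return 1;       /* mismatch or abort: the caller retries */
    *v = newv;              /* store commits the update */
    return 0;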
@@ -542,7 +678,7 @@ void this_cpu_list_push(struct percpu_list *list,
  * rseq primitive allows us to implement pop without concerns over
  * ABA-type races.
  */
-struct percpu_list_node *this_cpu_list_pop(struct percpu_list *list,
+static struct percpu_list_node *this_cpu_list_pop(struct percpu_list __rseq_percpu *list,
                                           int *_cpu)
 {
        struct percpu_list_node *node = NULL;
@@ -551,16 +687,19 @@ struct percpu_list_node *this_cpu_list_pop(struct percpu_list *list,
        for (;;) {
                struct percpu_list_node *head;
                intptr_t *targetptr, expectnot, *load;
-               off_t offset;
+               struct percpu_list *cpulist;
+               long offset;
                int ret;
 
-               cpu = rseq_cpu_start();
-               targetptr = (intptr_t *)&list->c[cpu].head;
+               cpu = get_current_cpu_id();
+               cpulist = rseq_percpu_ptr(list, cpu);
+               targetptr = (intptr_t *)&cpulist->head;
                expectnot = (intptr_t)NULL;
                offset = offsetof(struct percpu_list_node, next);
                load = (intptr_t *)&head;
-               ret = rseq_cmpnev_storeoffp_load(targetptr, expectnot,
-                                                  offset, load, cpu);
+               ret = rseq_load_cbeq_store_add_load_store__ptr(RSEQ_MO_RELAXED, RSEQ_PERCPU,
+                                                targetptr, expectnot,
+                                                offset, load, cpu);
                if (rseq_likely(!ret)) {
                        node = head;
                        break;
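
Likewise, rseq_load_cbeq_store_add_load_store__ptr() is the former rseq_cmpnev_storeoffp_load(). Sketched as non-atomic pseudo-C, it is the ABA-safe pop described in the comment above this function: the old head is published to *load and replaced by head->next within one restartable sequence:

    if (*targetptr == expectnot)                 /* empty list: fail */
            return 1;
    *load = *targetptr;                          /* publish the old head */
    *targetptr = *(intptr_t *)(*load + offset);  /* head = head->next */
    return 0;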
@@ -578,21 +717,22 @@ struct percpu_list_node *this_cpu_list_pop(struct percpu_list *list,
  * __percpu_list_pop is not safe against concurrent accesses. Should
  * only be used on lists that are not concurrently modified.
  */
-struct percpu_list_node *__percpu_list_pop(struct percpu_list *list, int cpu)
+static struct percpu_list_node *__percpu_list_pop(struct percpu_list __rseq_percpu *list, int cpu)
 {
+       struct percpu_list *cpulist = rseq_percpu_ptr(list, cpu);
        struct percpu_list_node *node;
 
-       node = list->c[cpu].head;
+       node = cpulist->head;
        if (!node)
                return NULL;
-       list->c[cpu].head = node->next;
+       cpulist->head = node->next;
        return node;
 }
 
-void *test_percpu_list_thread(void *arg)
+static void *test_percpu_list_thread(void *arg)
 {
        long long i, reps;
-       struct percpu_list *list = (struct percpu_list *)arg;
+       struct percpu_list __rseq_percpu *list = (struct percpu_list __rseq_percpu *)arg;
 
        if (!opt_disable_rseq && rseq_register_current_thread())
                abort();
@@ -617,23 +757,35 @@ void *test_percpu_list_thread(void *arg)
 }
 
 /* Simultaneous modification to a per-cpu linked list from many threads.  */
-void test_percpu_list(void)
+static void test_percpu_list(void)
 {
        const int num_threads = opt_threads;
        int i, j, ret;
        uint64_t sum = 0, expected_sum = 0;
-       struct percpu_list list;
+       struct percpu_list __rseq_percpu *list;
        pthread_t test_threads[num_threads];
        cpu_set_t allowed_cpus;
+       struct rseq_mempool *mempool;
 
-       memset(&list, 0, sizeof(list));
+       mempool = rseq_mempool_create("percpu_list", sizeof(struct percpu_list),
+                       0, CPU_SETSIZE, NULL);
+       if (!mempool) {
+               perror("rseq_mempool_create");
+               abort();
+       }
+       list = (struct percpu_list __rseq_percpu *)rseq_percpu_zmalloc(mempool);
+       if (!list) {
+               perror("rseq_percpu_zmalloc");
+               abort();
+       }
 
        /* Generate list entries for every usable cpu. */
        sched_getaffinity(0, sizeof(allowed_cpus), &allowed_cpus);
        for (i = 0; i < CPU_SETSIZE; i++) {
-               if (!CPU_ISSET(i, &allowed_cpus))
+               if (rseq_use_cpu_index() && !CPU_ISSET(i, &allowed_cpus))
                        continue;
                for (j = 1; j <= 100; j++) {
+                       struct percpu_list *cpulist = rseq_percpu_ptr(list, i);
                        struct percpu_list_node *node;
 
                        expected_sum += j;
@@ -641,14 +793,14 @@ void test_percpu_list(void)
                        node = (struct percpu_list_node *) malloc(sizeof(*node));
                        assert(node);
                        node->data = j;
-                       node->next = list.c[i].head;
-                       list.c[i].head = node;
+                       node->next = cpulist->head;
+                       cpulist->head = node;
                }
        }
 
        for (i = 0; i < num_threads; i++) {
                ret = pthread_create(&test_threads[i], NULL,
-                                    test_percpu_list_thread, &list);
+                                    test_percpu_list_thread, list);
                if (ret) {
                        errno = ret;
                        perror("pthread_create");
@@ -668,10 +820,10 @@ void test_percpu_list(void)
        for (i = 0; i < CPU_SETSIZE; i++) {
                struct percpu_list_node *node;
 
-               if (!CPU_ISSET(i, &allowed_cpus))
+               if (rseq_use_cpu_index() && !CPU_ISSET(i, &allowed_cpus))
                        continue;
 
-               while ((node = __percpu_list_pop(&list, i))) {
+               while ((node = __percpu_list_pop(list, i))) {
                        sum += node->data;
                        free(node);
                }
@@ -683,9 +835,15 @@ void test_percpu_list(void)
         * test is running).
         */
        assert(sum == expected_sum);
+       rseq_percpu_free(list);
+       ret = rseq_mempool_destroy(mempool);
+       if (ret) {
+               perror("rseq_mempool_destroy");
+               abort();
+       }
 }
 
-bool this_cpu_buffer_push(struct percpu_buffer *buffer,
+static bool this_cpu_buffer_push(struct percpu_buffer __rseq_percpu *buffer,
                          struct percpu_buffer_node *node,
                          int *_cpu)
 {
@@ -693,27 +851,24 @@ bool this_cpu_buffer_push(struct percpu_buffer *buffer,
        int cpu;
 
        for (;;) {
+               struct percpu_buffer *cpubuffer;
                intptr_t *targetptr_spec, newval_spec;
                intptr_t *targetptr_final, newval_final;
                intptr_t offset;
                int ret;
 
-               cpu = rseq_cpu_start();
-               offset = RSEQ_READ_ONCE(buffer->c[cpu].offset);
-               if (offset == buffer->c[cpu].buflen)
+               cpu = get_current_cpu_id();
+               cpubuffer = rseq_percpu_ptr(buffer, cpu);
+               offset = RSEQ_READ_ONCE(cpubuffer->offset);
+               if (offset == cpubuffer->buflen)
                        break;
                newval_spec = (intptr_t)node;
-               targetptr_spec = (intptr_t *)&buffer->c[cpu].array[offset];
+               targetptr_spec = (intptr_t *)&cpubuffer->array[offset];
                newval_final = offset + 1;
-               targetptr_final = &buffer->c[cpu].offset;
-               if (opt_mb)
-                       ret = rseq_cmpeqv_trystorev_storev_release(
-                               targetptr_final, offset, targetptr_spec,
-                               newval_spec, newval_final, cpu);
-               else
-                       ret = rseq_cmpeqv_trystorev_storev(targetptr_final,
-                               offset, targetptr_spec, newval_spec,
-                               newval_final, cpu);
+               targetptr_final = &cpubuffer->offset;
+               ret = rseq_load_cbne_store_store__ptr(opt_mo, RSEQ_PERCPU,
+                       targetptr_final, offset, targetptr_spec,
+                       newval_spec, newval_final, cpu);
                if (rseq_likely(!ret)) {
                        result = true;
                        break;
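
The push side shows how the old opt_mb flag was absorbed: rather than choosing between rseq_cmpeqv_trystorev_storev() and its _release variant, the single rseq_load_cbne_store_store__ptr() takes opt_mo (RSEQ_MO_RELAXED, or RSEQ_MO_RELEASE under -M) as its memory-order argument. A pseudo-C sketch of its two stores:

    if (*targetptr_final != offset)      /* load + cbne: offset moved, retry */
            return 1;
    *targetptr_spec = newval_spec;       /* speculative store into the array slot */
    *targetptr_final = newval_final;     /* final store publishes offset + 1;
                                            a release store when opt_mo asks for it */
    return 0;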
@@ -725,29 +880,32 @@ bool this_cpu_buffer_push(struct percpu_buffer *buffer,
        return result;
 }
 
-struct percpu_buffer_node *this_cpu_buffer_pop(struct percpu_buffer *buffer,
+static struct percpu_buffer_node *this_cpu_buffer_pop(struct percpu_buffer __rseq_percpu *buffer,
                                               int *_cpu)
 {
        struct percpu_buffer_node *head;
        int cpu;
 
        for (;;) {
+               struct percpu_buffer *cpubuffer;
                intptr_t *targetptr, newval;
                intptr_t offset;
                int ret;
 
-               cpu = rseq_cpu_start();
+               cpu = get_current_cpu_id();
+               cpubuffer = rseq_percpu_ptr(buffer, cpu);
                /* Load offset with single-copy atomicity. */
-               offset = RSEQ_READ_ONCE(buffer->c[cpu].offset);
+               offset = RSEQ_READ_ONCE(cpubuffer->offset);
                if (offset == 0) {
                        head = NULL;
                        break;
                }
-               head = RSEQ_READ_ONCE(buffer->c[cpu].array[offset - 1]);
+               head = RSEQ_READ_ONCE(cpubuffer->array[offset - 1]);
                newval = offset - 1;
-               targetptr = (intptr_t *)&buffer->c[cpu].offset;
-               ret = rseq_cmpeqv_cmpeqv_storev(targetptr, offset,
-                       (intptr_t *)&buffer->c[cpu].array[offset - 1],
+               targetptr = (intptr_t *)&cpubuffer->offset;
+               ret = rseq_load_cbne_load_cbne_store__ptr(RSEQ_MO_RELAXED, RSEQ_PERCPU,
+                       targetptr, offset,
+                       (intptr_t *)&cpubuffer->array[offset - 1],
                        (intptr_t)head, newval, cpu);
                if (rseq_likely(!ret))
                        break;
@@ -762,24 +920,26 @@ struct percpu_buffer_node *this_cpu_buffer_pop(struct percpu_buffer *buffer,
  * __percpu_buffer_pop is not safe against concurrent accesses. Should
  * only be used on buffers that are not concurrently modified.
  */
-struct percpu_buffer_node *__percpu_buffer_pop(struct percpu_buffer *buffer,
+static struct percpu_buffer_node *__percpu_buffer_pop(struct percpu_buffer __rseq_percpu *buffer,
                                               int cpu)
 {
+       struct percpu_buffer *cpubuffer;
        struct percpu_buffer_node *head;
        intptr_t offset;
 
-       offset = buffer->c[cpu].offset;
+       cpubuffer = rseq_percpu_ptr(buffer, cpu);
+       offset = cpubuffer->offset;
        if (offset == 0)
                return NULL;
-       head = buffer->c[cpu].array[offset - 1];
-       buffer->c[cpu].offset = offset - 1;
+       head = cpubuffer->array[offset - 1];
+       cpubuffer->offset = offset - 1;
        return head;
 }
 
-void *test_percpu_buffer_thread(void *arg)
+static void *test_percpu_buffer_thread(void *arg)
 {
        long long i, reps;
-       struct percpu_buffer *buffer = (struct percpu_buffer *)arg;
+       struct percpu_buffer __rseq_percpu *buffer = (struct percpu_buffer __rseq_percpu *)arg;
 
        if (!opt_disable_rseq && rseq_register_current_thread())
                abort();
@@ -808,29 +968,43 @@ void *test_percpu_buffer_thread(void *arg)
 }
 
 /* Simultaneous modification to a per-cpu buffer from many threads.  */
-void test_percpu_buffer(void)
+static void test_percpu_buffer(void)
 {
        const int num_threads = opt_threads;
        int i, j, ret;
        uint64_t sum = 0, expected_sum = 0;
-       struct percpu_buffer buffer;
+       struct percpu_buffer __rseq_percpu *buffer;
        pthread_t test_threads[num_threads];
        cpu_set_t allowed_cpus;
+       struct rseq_mempool *mempool;
 
-       memset(&buffer, 0, sizeof(buffer));
+       mempool = rseq_mempool_create("percpu_buffer", sizeof(struct percpu_buffer),
+                       0, CPU_SETSIZE, NULL);
+       if (!mempool) {
+               perror("rseq_mempool_create");
+               abort();
+       }
+       buffer = (struct percpu_buffer __rseq_percpu *)rseq_percpu_zmalloc(mempool);
+       if (!buffer) {
+               perror("rseq_percpu_zmalloc");
+               abort();
+       }
 
        /* Generate list entries for every usable cpu. */
        sched_getaffinity(0, sizeof(allowed_cpus), &allowed_cpus);
        for (i = 0; i < CPU_SETSIZE; i++) {
-               if (!CPU_ISSET(i, &allowed_cpus))
+               struct percpu_buffer *cpubuffer;
+
+               if (rseq_use_cpu_index() && !CPU_ISSET(i, &allowed_cpus))
                        continue;
+               cpubuffer = rseq_percpu_ptr(buffer, i);
                /* Worst-case is every item in same CPU. */
-               buffer.c[i].array =
+               cpubuffer->array =
                        (struct percpu_buffer_node **)
-                       malloc(sizeof(*buffer.c[i].array) * CPU_SETSIZE *
+                       malloc(sizeof(*cpubuffer->array) * CPU_SETSIZE *
                               BUFFER_ITEM_PER_CPU);
-               assert(buffer.c[i].array);
-               buffer.c[i].buflen = CPU_SETSIZE * BUFFER_ITEM_PER_CPU;
+               assert(cpubuffer->array);
+               cpubuffer->buflen = CPU_SETSIZE * BUFFER_ITEM_PER_CPU;
                for (j = 1; j <= BUFFER_ITEM_PER_CPU; j++) {
                        struct percpu_buffer_node *node;
 
@@ -846,14 +1020,14 @@ void test_percpu_buffer(void)
                        node = (struct percpu_buffer_node *) malloc(sizeof(*node));
                        assert(node);
                        node->data = j;
-                       buffer.c[i].array[j - 1] = node;
-                       buffer.c[i].offset++;
+                       cpubuffer->array[j - 1] = node;
+                       cpubuffer->offset++;
                }
        }
 
        for (i = 0; i < num_threads; i++) {
                ret = pthread_create(&test_threads[i], NULL,
-                                    test_percpu_buffer_thread, &buffer);
+                                    test_percpu_buffer_thread, buffer);
                if (ret) {
                        errno = ret;
                        perror("pthread_create");
@@ -871,16 +1045,18 @@ void test_percpu_buffer(void)
        }
 
        for (i = 0; i < CPU_SETSIZE; i++) {
+               struct percpu_buffer *cpubuffer;
                struct percpu_buffer_node *node;
 
-               if (!CPU_ISSET(i, &allowed_cpus))
+               if (rseq_use_cpu_index() && !CPU_ISSET(i, &allowed_cpus))
                        continue;
 
-               while ((node = __percpu_buffer_pop(&buffer, i))) {
+               cpubuffer = rseq_percpu_ptr(buffer, i);
+               while ((node = __percpu_buffer_pop(buffer, i))) {
                        sum += node->data;
                        free(node);
                }
-               free(buffer.c[i].array);
+               free(cpubuffer->array);
        }
 
        /*
@@ -889,9 +1065,15 @@ void test_percpu_buffer(void)
         * test is running).
         */
        assert(sum == expected_sum);
+       rseq_percpu_free(buffer);
+       ret = rseq_mempool_destroy(mempool);
+       if (ret) {
+               perror("rseq_mempool_destroy");
+               abort();
+       }
 }
 
-bool this_cpu_memcpy_buffer_push(struct percpu_memcpy_buffer *buffer,
+static bool this_cpu_memcpy_buffer_push(struct percpu_memcpy_buffer __rseq_percpu *buffer,
                                 struct percpu_memcpy_buffer_node item,
                                 int *_cpu)
 {
@@ -899,31 +1081,29 @@ bool this_cpu_memcpy_buffer_push(struct percpu_memcpy_buffer *buffer,
        int cpu;
 
        for (;;) {
+               struct percpu_memcpy_buffer *cpubuffer;
                intptr_t *targetptr_final, newval_final, offset;
                char *destptr, *srcptr;
                size_t copylen;
                int ret;
 
-               cpu = rseq_cpu_start();
+               cpu = get_current_cpu_id();
+               cpubuffer = rseq_percpu_ptr(buffer, cpu);
                /* Load offset with single-copy atomicity. */
-               offset = RSEQ_READ_ONCE(buffer->c[cpu].offset);
-               if (offset == buffer->c[cpu].buflen)
+               offset = RSEQ_READ_ONCE(cpubuffer->offset);
+               if (offset == cpubuffer->buflen)
                        break;
-               destptr = (char *)&buffer->c[cpu].array[offset];
+               destptr = (char *)&cpubuffer->array[offset];
                srcptr = (char *)&item;
                /* copylen must be <= 4kB. */
                copylen = sizeof(item);
                newval_final = offset + 1;
-               targetptr_final = &buffer->c[cpu].offset;
-               if (opt_mb)
-                       ret = rseq_cmpeqv_trymemcpy_storev_release(
-                               targetptr_final, offset,
-                               destptr, srcptr, copylen,
-                               newval_final, cpu);
-               else
-                       ret = rseq_cmpeqv_trymemcpy_storev(targetptr_final,
-                               offset, destptr, srcptr, copylen,
-                               newval_final, cpu);
+               targetptr_final = &cpubuffer->offset;
+               ret = rseq_load_cbne_memcpy_store__ptr(
+                       opt_mo, RSEQ_PERCPU,
+                       targetptr_final, offset,
+                       destptr, srcptr, copylen,
+                       newval_final, cpu);
                if (rseq_likely(!ret)) {
                        result = true;
                        break;
@@ -935,7 +1115,7 @@ bool this_cpu_memcpy_buffer_push(struct percpu_memcpy_buffer *buffer,
        return result;
 }
 
-bool this_cpu_memcpy_buffer_pop(struct percpu_memcpy_buffer *buffer,
+static bool this_cpu_memcpy_buffer_pop(struct percpu_memcpy_buffer __rseq_percpu *buffer,
                                struct percpu_memcpy_buffer_node *item,
                                int *_cpu)
 {
@@ -943,24 +1123,26 @@ bool this_cpu_memcpy_buffer_pop(struct percpu_memcpy_buffer *buffer,
        int cpu;
 
        for (;;) {
+               struct percpu_memcpy_buffer *cpubuffer;
                intptr_t *targetptr_final, newval_final, offset;
                char *destptr, *srcptr;
                size_t copylen;
                int ret;
 
-               cpu = rseq_cpu_start();
+               cpu = get_current_cpu_id();
+               cpubuffer = rseq_percpu_ptr(buffer, cpu);
                /* Load offset with single-copy atomicity. */
-               offset = RSEQ_READ_ONCE(buffer->c[cpu].offset);
+               offset = RSEQ_READ_ONCE(cpubuffer->offset);
                if (offset == 0)
                        break;
                destptr = (char *)item;
-               srcptr = (char *)&buffer->c[cpu].array[offset - 1];
+               srcptr = (char *)&cpubuffer->array[offset - 1];
                /* copylen must be <= 4kB. */
                copylen = sizeof(*item);
                newval_final = offset - 1;
-               targetptr_final = &buffer->c[cpu].offset;
-               ret = rseq_cmpeqv_trymemcpy_storev(targetptr_final,
-                       offset, destptr, srcptr, copylen,
+               targetptr_final = &cpubuffer->offset;
+               ret = rseq_load_cbne_memcpy_store__ptr(RSEQ_MO_RELAXED, RSEQ_PERCPU,
+                       targetptr_final, offset, destptr, srcptr, copylen,
                        newval_final, cpu);
                if (rseq_likely(!ret)) {
                        result = true;
@@ -977,24 +1159,26 @@ bool this_cpu_memcpy_buffer_pop(struct percpu_memcpy_buffer *buffer,
  * __percpu_memcpy_buffer_pop is not safe against concurrent accesses. Should
  * only be used on buffers that are not concurrently modified.
  */
-bool __percpu_memcpy_buffer_pop(struct percpu_memcpy_buffer *buffer,
+static bool __percpu_memcpy_buffer_pop(struct percpu_memcpy_buffer __rseq_percpu *buffer,
                                struct percpu_memcpy_buffer_node *item,
                                int cpu)
 {
+       struct percpu_memcpy_buffer *cpubuffer;
        intptr_t offset;
 
-       offset = buffer->c[cpu].offset;
+       cpubuffer = rseq_percpu_ptr(buffer, cpu);
+       offset = cpubuffer->offset;
        if (offset == 0)
                return false;
-       memcpy(item, &buffer->c[cpu].array[offset - 1], sizeof(*item));
-       buffer->c[cpu].offset = offset - 1;
+       memcpy(item, &cpubuffer->array[offset - 1], sizeof(*item));
+       cpubuffer->offset = offset - 1;
        return true;
 }
 
-void *test_percpu_memcpy_buffer_thread(void *arg)
+static void *test_percpu_memcpy_buffer_thread(void *arg)
 {
        long long i, reps;
-       struct percpu_memcpy_buffer *buffer = (struct percpu_memcpy_buffer *)arg;
+       struct percpu_memcpy_buffer __rseq_percpu *buffer = (struct percpu_memcpy_buffer __rseq_percpu *)arg;
 
        if (!opt_disable_rseq && rseq_register_current_thread())
                abort();
@@ -1024,29 +1208,44 @@ void *test_percpu_memcpy_buffer_thread(void *arg)
 }
 
 /* Simultaneous modification to a per-cpu buffer from many threads.  */
-void test_percpu_memcpy_buffer(void)
+static void test_percpu_memcpy_buffer(void)
 {
        const int num_threads = opt_threads;
        int i, j, ret;
        uint64_t sum = 0, expected_sum = 0;
-       struct percpu_memcpy_buffer buffer;
+       struct percpu_memcpy_buffer __rseq_percpu *buffer;
        pthread_t test_threads[num_threads];
        cpu_set_t allowed_cpus;
+       struct rseq_mempool *mempool;
 
-       memset(&buffer, 0, sizeof(buffer));
+       mempool = rseq_mempool_create("percpu_memcpy_buffer",
+                       sizeof(struct percpu_memcpy_buffer),
+                       0, CPU_SETSIZE, NULL);
+       if (!mempool) {
+               perror("rseq_mempool_create");
+               abort();
+       }
+       buffer = (struct percpu_memcpy_buffer __rseq_percpu *)rseq_percpu_zmalloc(mempool);
+       if (!buffer) {
+               perror("rseq_percpu_zmalloc");
+               abort();
+       }
 
        /* Generate list entries for every usable cpu. */
        sched_getaffinity(0, sizeof(allowed_cpus), &allowed_cpus);
        for (i = 0; i < CPU_SETSIZE; i++) {
-               if (!CPU_ISSET(i, &allowed_cpus))
+               struct percpu_memcpy_buffer *cpubuffer;
+
+               if (rseq_use_cpu_index() && !CPU_ISSET(i, &allowed_cpus))
                        continue;
+               cpubuffer = rseq_percpu_ptr(buffer, i);
                /* Worst-case is every item in same CPU. */
-               buffer.c[i].array =
+               cpubuffer->array =
                        (struct percpu_memcpy_buffer_node *)
-                       malloc(sizeof(*buffer.c[i].array) * CPU_SETSIZE *
+                       malloc(sizeof(*cpubuffer->array) * CPU_SETSIZE *
                               MEMCPY_BUFFER_ITEM_PER_CPU);
-               assert(buffer.c[i].array);
-               buffer.c[i].buflen = CPU_SETSIZE * MEMCPY_BUFFER_ITEM_PER_CPU;
+               assert(cpubuffer->array);
+               cpubuffer->buflen = CPU_SETSIZE * MEMCPY_BUFFER_ITEM_PER_CPU;
                for (j = 1; j <= MEMCPY_BUFFER_ITEM_PER_CPU; j++) {
                        expected_sum += 2 * j + 1;
 
@@ -1057,16 +1256,16 @@ void test_percpu_memcpy_buffer(void)
                         * within a single word, so allocate an object
                         * for each node.
                         */
-                       buffer.c[i].array[j - 1].data1 = j;
-                       buffer.c[i].array[j - 1].data2 = j + 1;
-                       buffer.c[i].offset++;
+                       cpubuffer->array[j - 1].data1 = j;
+                       cpubuffer->array[j - 1].data2 = j + 1;
+                       cpubuffer->offset++;
                }
        }
 
        for (i = 0; i < num_threads; i++) {
                ret = pthread_create(&test_threads[i], NULL,
                                     test_percpu_memcpy_buffer_thread,
-                                    &buffer);
+                                    buffer);
                if (ret) {
                        errno = ret;
                        perror("pthread_create");
@@ -1085,15 +1284,17 @@ void test_percpu_memcpy_buffer(void)
 
        for (i = 0; i < CPU_SETSIZE; i++) {
                struct percpu_memcpy_buffer_node item;
+               struct percpu_memcpy_buffer *cpubuffer;
 
-               if (!CPU_ISSET(i, &allowed_cpus))
+               if (rseq_use_cpu_index() && !CPU_ISSET(i, &allowed_cpus))
                        continue;
 
-               while (__percpu_memcpy_buffer_pop(&buffer, &item, i)) {
+               cpubuffer = rseq_percpu_ptr(buffer, i);
+               while (__percpu_memcpy_buffer_pop(buffer, &item, i)) {
                        sum += item.data1;
                        sum += item.data2;
                }
-               free(buffer.c[i].array);
+               free(cpubuffer->array);
        }
 
        /*
@@ -1102,9 +1303,14 @@ void test_percpu_memcpy_buffer(void)
         * test is running).
         */
        assert(sum == expected_sum);
+       rseq_percpu_free(buffer);
+       ret = rseq_mempool_destroy(mempool);
+       if (ret) {
+               perror("rseq_mempool_destroy");
+               abort();
+       }
 }
 
-
 static void test_signal_interrupt_handler(__attribute__ ((unused)) int signo)
 {
        signals_delivered++;
@@ -1136,6 +1342,297 @@ static int set_signal_handler(void)
        return ret;
 }
 
+static
+bool membarrier_private_expedited_rseq_available(void)
+{
+       int status = sys_membarrier(MEMBARRIER_CMD_QUERY, 0, 0);
+
+       if (status < 0) {
+               perror("membarrier");
+               return false;
+       }
+       if (!(status & MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ))
+               return false;
+       return true;
+}
+
+/* Test the MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ membarrier command. */
+#ifdef TEST_MEMBARRIER
+struct test_membarrier_thread_args {
+       struct rseq_mempool *mempool;
+       struct percpu_list __rseq_percpu *percpu_list_ptr;
+       int stop;
+};
+
+/* Worker threads modify data in their "active" percpu lists. */
+static
+void *test_membarrier_worker_thread(void *arg)
+{
+       struct test_membarrier_thread_args *args =
+               (struct test_membarrier_thread_args *)arg;
+       const long long iters = opt_reps;
+       long long i;
+
+       if (rseq_register_current_thread()) {
+               fprintf(stderr, "Error: rseq_register_current_thread(...) failed(%d): %s\n",
+                       errno, strerror(errno));
+               abort();
+       }
+
+       /* Wait for initialization. */
+       while (!rseq_smp_load_acquire(&args->percpu_list_ptr)) { }
+
+       for (i = 0; i < iters; ++i) {
+               int ret;
+
+               do {
+                       int cpu = get_current_cpu_id();
+                       struct percpu_list __rseq_percpu *list = RSEQ_READ_ONCE(args->percpu_list_ptr);
+                       struct percpu_list *cpulist = rseq_percpu_ptr(list, cpu);
+
+                       ret = rseq_load_cbne_load_add_load_add_store__ptr(RSEQ_MO_RELAXED, RSEQ_PERCPU,
+                               (intptr_t *) &args->percpu_list_ptr,
+                               (intptr_t) list, (intptr_t *) &cpulist->head, 0, 1, cpu);
+               } while (rseq_unlikely(ret));
+       }
+
+       if (rseq_unregister_current_thread()) {
+               fprintf(stderr, "Error: rseq_unregister_current_thread(...) failed(%d): %s\n",
+                       errno, strerror(errno));
+               abort();
+       }
+       return NULL;
+}
+
+static
+struct percpu_list __rseq_percpu *test_membarrier_alloc_percpu_list(struct rseq_mempool *mempool)
+{
+       struct percpu_list __rseq_percpu *list;
+       int i;
+
+       list = (struct percpu_list __rseq_percpu *)rseq_percpu_zmalloc(mempool);
+       if (!list) {
+               perror("rseq_percpu_zmalloc");
+               return NULL;
+       }
+       for (i = 0; i < CPU_SETSIZE; i++) {
+               struct percpu_list *cpulist = rseq_percpu_ptr(list, i);
+               struct percpu_list_node *node;
+
+               node = (struct percpu_list_node *) malloc(sizeof(*node));
+               assert(node);
+               node->data = 0;
+               node->next = NULL;
+               cpulist->head = node;
+       }
+       return list;
+}
+
+static
+void test_membarrier_free_percpu_list(struct percpu_list __rseq_percpu *list)
+{
+       int i;
+
+       for (i = 0; i < CPU_SETSIZE; i++)
+               free(rseq_percpu_ptr(list, i)->head);
+       rseq_percpu_free(list);
+}
+
+static
+long long test_membarrier_count_percpu_list(struct percpu_list __rseq_percpu *list)
+{
+       long long total_count = 0;
+       int i;
+
+       for (i = 0; i < CPU_SETSIZE; i++)
+               total_count += rseq_percpu_ptr(list, i)->head->data;
+       return total_count;
+}
+
+/*
+ * The manager thread swaps per-cpu lists that worker threads see,
+ * and validates that there are no unexpected modifications.
+ */
+static
+void *test_membarrier_manager_thread(void *arg)
+{
+       struct test_membarrier_thread_args *args =
+               (struct test_membarrier_thread_args *)arg;
+       struct percpu_list __rseq_percpu *list_a, __rseq_percpu *list_b;
+       intptr_t expect_a = 0, expect_b = 0;
+       int cpu_a = 0, cpu_b = 0;
+       struct rseq_mempool *mempool;
+       int ret;
+       long long total_count = 0;
+
+       mempool = rseq_mempool_create("percpu_list", sizeof(struct percpu_list),
+                       0, CPU_SETSIZE, NULL);
+       if (!mempool) {
+               perror("rseq_mempool_create");
+               abort();
+       }
+       args->mempool = mempool;
+
+       if (rseq_register_current_thread()) {
+               fprintf(stderr, "Error: rseq_register_current_thread(...) failed(%d): %s\n",
+                       errno, strerror(errno));
+               abort();
+       }
+
+       /* Init lists. */
+       list_a = test_membarrier_alloc_percpu_list(mempool);
+       assert(list_a);
+       list_b = test_membarrier_alloc_percpu_list(mempool);
+       assert(list_b);
+
+       /* Initialize lists before publishing them. */
+       rseq_smp_wmb();
+
+       RSEQ_WRITE_ONCE(args->percpu_list_ptr, list_a);
+
+       while (!RSEQ_READ_ONCE(args->stop)) {
+               /* list_a is "active". */
+               cpu_a = rand() % CPU_SETSIZE;
+               /*
+                * As list_b is "inactive", we should never see changes
+                * to list_b.
+                */
+               if (expect_b != RSEQ_READ_ONCE(rseq_percpu_ptr(list_b, cpu_b)->head->data)) {
+                       fprintf(stderr, "Membarrier test failed\n");
+                       abort();
+               }
+
+               /* Make list_b "active". */
+               RSEQ_WRITE_ONCE(args->percpu_list_ptr, list_b);
+               if (rseq_membarrier_expedited(cpu_a) &&
+                               errno != ENXIO /* missing CPU */) {
+                       perror("sys_membarrier");
+                       abort();
+               }
+               /*
+                * Cpu A should now only modify list_b, so the values
+                * in list_a should be stable.
+                */
+               expect_a = RSEQ_READ_ONCE(rseq_percpu_ptr(list_a, cpu_a)->head->data);
+
+               cpu_b = rand() % CPU_SETSIZE;
+               /*
+                * As list_a is "inactive", we should never see changes
+                * to list_a.
+                */
+               if (expect_a != RSEQ_READ_ONCE(rseq_percpu_ptr(list_a, cpu_a)->head->data)) {
+                       fprintf(stderr, "Membarrier test failed\n");
+                       abort();
+               }
+
+               /* Make list_a "active". */
+               RSEQ_WRITE_ONCE(args->percpu_list_ptr, list_a);
+               if (rseq_membarrier_expedited(cpu_b) &&
+                               errno != ENXIO /* missing CPU */) {
+                       perror("sys_membarrier");
+                       abort();
+               }
+               /* Remember a value from list_b. */
+               expect_b = RSEQ_READ_ONCE(rseq_percpu_ptr(list_b, cpu_b)->head->data);
+       }
+
+       total_count += test_membarrier_count_percpu_list(list_a);
+       total_count += test_membarrier_count_percpu_list(list_b);
+
+       /* Validate that we observe the right number of increments. */
+       if (total_count != opt_threads * opt_reps) {
+               fprintf(stderr, "Error: Observed %lld increments, expected %lld\n",
+                       total_count, opt_threads * opt_reps);
+               abort();
+       }
+       test_membarrier_free_percpu_list(list_a);
+       test_membarrier_free_percpu_list(list_b);
+
+       if (rseq_unregister_current_thread()) {
+               fprintf(stderr, "Error: rseq_unregister_current_thread(...) failed(%d): %s\n",
+                       errno, strerror(errno));
+               abort();
+       }
+       ret = rseq_mempool_destroy(mempool);
+       if (ret) {
+               perror("rseq_mempool_destroy");
+               abort();
+       }
+
+       return NULL;
+}
+
+static
+void test_membarrier(void)
+{
+       const int num_threads = opt_threads;
+       struct test_membarrier_thread_args thread_args;
+       pthread_t worker_threads[num_threads];
+       pthread_t manager_thread;
+       int i, ret;
+
+       if (!membarrier_private_expedited_rseq_available()) {
+               fprintf(stderr, "Membarrier private expedited rseq not available. "
+                               "Skipping membarrier test.\n");
+               return;
+       }
+       if (sys_membarrier(MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ, 0, 0)) {
+               perror("sys_membarrier");
+               abort();
+       }
+
+       thread_args.percpu_list_ptr = NULL;
+       thread_args.stop = 0;
+       ret = pthread_create(&manager_thread, NULL,
+                       test_membarrier_manager_thread, &thread_args);
+       if (ret) {
+               errno = ret;
+               perror("pthread_create");
+               abort();
+       }
+
+       for (i = 0; i < num_threads; i++) {
+               ret = pthread_create(&worker_threads[i], NULL,
+                               test_membarrier_worker_thread, &thread_args);
+               if (ret) {
+                       errno = ret;
+                       perror("pthread_create");
+                       abort();
+               }
+       }
+
+
+       for (i = 0; i < num_threads; i++) {
+               ret = pthread_join(worker_threads[i], NULL);
+               if (ret) {
+                       errno = ret;
+                       perror("pthread_join");
+                       abort();
+               }
+       }
+
+       RSEQ_WRITE_ONCE(thread_args.stop, 1);
+       ret = pthread_join(manager_thread, NULL);
+       if (ret) {
+               errno = ret;
+               perror("pthread_join");
+               abort();
+       }
+}
+#else /* TEST_MEMBARRIER */
+static
+void test_membarrier(void)
+{
+       if (!membarrier_private_expedited_rseq_available()) {
+               fprintf(stderr, "Membarrier private expedited rseq not available. "
+                               "Skipping membarrier test.\n");
+               return;
+       }
+       fprintf(stderr, "rseq_load_cbne_load_add_load_add_store__ptr is not implemented on this architecture. "
+                       "Skipping membarrier test.\n");
+}
+#endif
+
 static void show_usage(char **argv)
 {
        printf("Usage : %s <OPTIONS>\n",
@@ -1158,7 +1655,7 @@ static void show_usage(char **argv)
        printf("        [-r N] Number of repetitions per thread (default 5000)\n");
        printf("        [-d] Disable rseq system call (no initialization)\n");
        printf("        [-D M] Disable rseq for each M threads\n");
-       printf("        [-T test] Choose test: (s)pinlock, (l)ist, (b)uffer, (m)emcpy, (i)ncrement\n");
+       printf("        [-T test] Choose test: (s)pinlock, (l)ist, (b)uffer, (m)emcpy, (i)ncrement, membarrie(r)\n");
        printf("        [-M] Push into buffer and memcpy buffer with memory barriers.\n");
        printf("        [-c] Check if the rseq syscall is available.\n");
        printf("        [-v] Verbose output.\n");
@@ -1274,6 +1771,7 @@ int main(int argc, char **argv)
                        case 'i':
                        case 'b':
                        case 'm':
+                       case 'r':
                                break;
                        default:
                                show_usage(argv);
@@ -1285,10 +1783,10 @@ int main(int argc, char **argv)
                        verbose = 1;
                        break;
                case 'M':
-                       opt_mb = 1;
+                       opt_mo = RSEQ_MO_RELEASE;
                        break;
                case 'c':
-                       if (rseq_available()) {
+                       if (rseq_available(RSEQ_AVAILABLE_QUERY_KERNEL)) {
                                printf_verbose("The rseq syscall is available.\n");
                                goto end;
                        } else {
@@ -1313,6 +1811,10 @@ int main(int argc, char **argv)
 
        if (!opt_disable_rseq && rseq_register_current_thread())
                goto error;
+       if (!opt_disable_rseq && !rseq_validate_cpu_id()) {
+               printf_verbose("The rseq cpu id getter is unavailable\n");
+               goto no_rseq;
+       }
        switch (opt_test) {
        case 's':
                printf_verbose("spinlock\n");
@@ -1334,6 +1836,10 @@ int main(int argc, char **argv)
                printf_verbose("counter increment\n");
                test_percpu_inc();
                break;
+       case 'r':
+               printf_verbose("membarrier\n");
+               test_membarrier();
+               break;
        }
        if (!opt_disable_rseq && rseq_unregister_current_thread())
                abort();