#define _GNU_SOURCE
#include <stdio.h>
#include <errno.h>
#include <unistd.h>
#include <stdlib.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <time.h>
#include <signal.h>
#include <poll.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <sys/ioctl.h>
#include <pthread.h>
#include <linux/userfaultfd.h>
#ifdef __NR_userfaultfd
static unsigned long nr_cpus, nr_pages, nr_pages_per_cpu, page_size;
#define BOUNCE_RANDOM (1<<0)
#define BOUNCE_RACINGFAULTS (1<<1)
#define BOUNCE_VERIFY (1<<2)
#define BOUNCE_POLL (1<<3)
static int bounces;
static unsigned long long *count_verify;
static int uffd, finished, *pipefd;
static char *area_src, *area_dst;
static char *zeropage;
pthread_attr_t attr;
#define area_mutex(___area, ___nr) \
((pthread_mutex_t *) ((___area) + (___nr)*page_size))
#define area_count(___area, ___nr) \
((volatile unsigned long long *) ((unsigned long) \
((___area) + (___nr)*page_size + \
sizeof(pthread_mutex_t) + \
sizeof(unsigned long long) - 1) & \
~(unsigned long)(sizeof(unsigned long long) \
- 1)))
static int my_bcmp(char *str1, char *str2, size_t n)
{
unsigned long i;
for (i = 0; i < n; i++)
if (str1[i] != str2[i])
return 1;
return 0;
}
static void *locking_thread(void *arg)
{
unsigned long cpu = (unsigned long) arg;
struct random_data rand;
unsigned long page_nr = *(&(page_nr));
int32_t rand_nr;
unsigned long long count;
char randstate[64];
unsigned int seed;
time_t start;
if (bounces & BOUNCE_RANDOM) {
seed = (unsigned int) time(NULL) - bounces;
if (!(bounces & BOUNCE_RACINGFAULTS))
seed += cpu;
bzero(&rand, sizeof(rand));
bzero(&randstate, sizeof(randstate));
if (initstate_r(seed, randstate, sizeof(randstate), &rand))
fprintf(stderr, "srandom_r error\n"), exit(1);
} else {
page_nr = -bounces;
if (!(bounces & BOUNCE_RACINGFAULTS))
page_nr += cpu * nr_pages_per_cpu;
}
while (!finished) {
if (bounces & BOUNCE_RANDOM) {
if (random_r(&rand, &rand_nr))
fprintf(stderr, "random_r 1 error\n"), exit(1);
page_nr = rand_nr;
if (sizeof(page_nr) > sizeof(rand_nr)) {
if (random_r(&rand, &rand_nr))
fprintf(stderr, "random_r 2 error\n"), exit(1);
page_nr |= (((unsigned long) rand_nr) << 16) <<
16;
}
} else
page_nr += 1;
page_nr %= nr_pages;
start = time(NULL);
if (bounces & BOUNCE_VERIFY) {
count = *area_count(area_dst, page_nr);
if (!count)
fprintf(stderr,
"page_nr %lu wrong count %Lu %Lu\n",
page_nr, count,
count_verify[page_nr]), exit(1);
#if 1
if (!my_bcmp(area_dst + page_nr * page_size, zeropage,
page_size))
fprintf(stderr,
"my_bcmp page_nr %lu wrong count %Lu %Lu\n",
page_nr, count,
count_verify[page_nr]), exit(1);
#else#endif
}
pthread_mutex_lock(area_mutex(area_dst, page_nr));
count = *area_count(area_dst, page_nr);
if (count != count_verify[page_nr]) {
fprintf(stderr,
"page_nr %lu memory corruption %Lu %Lu\n",
page_nr, count,
count_verify[page_nr]), exit(1);
}
count++;
*area_count(area_dst, page_nr) = count_verify[page_nr] = count;
pthread_mutex_unlock(area_mutex(area_dst, page_nr));
if (time(NULL) - start > 1)
fprintf(stderr,
"userfault too slow %ld "
"possible false positive with overcommit\n",
time(NULL) - start);
}
return NULL;
}
static int copy_page(unsigned long offset)
{
struct uffdio_copy uffdio_copy;
if (offset >= nr_pages * page_size)
fprintf(stderr, "unexpected offset %lu\n",
offset), exit(1);
uffdio_copy.dst = (unsigned long) area_dst + offset;
uffdio_copy.src = (unsigned long) area_src + offset;
uffdio_copy.len = page_size;
uffdio_copy.mode = 0;
uffdio_copy.copy = 0;
if (ioctl(uffd, UFFDIO_COPY, &uffdio_copy)) {
if (uffdio_copy.copy != -EEXIST)
fprintf(stderr, "UFFDIO_COPY error %Ld\n",
uffdio_copy.copy), exit(1);
} else if (uffdio_copy.copy != page_size) {
fprintf(stderr, "UFFDIO_COPY unexpected copy %Ld\n",
uffdio_copy.copy), exit(1);
} else
return 1;
return 0;
}
static void *uffd_poll_thread(void *arg)
{
unsigned long cpu = (unsigned long) arg;
struct pollfd pollfd[2];
struct uffd_msg msg;
int ret;
unsigned long offset;
char tmp_chr;
unsigned long userfaults = 0;
pollfd[0].fd = uffd;
pollfd[0].events = POLLIN;
pollfd[1].fd = pipefd[cpu*2];
pollfd[1].events = POLLIN;
for (;;) {
ret = poll(pollfd, 2, -1);
if (!ret)
fprintf(stderr, "poll error %d\n", ret), exit(1);
if (ret < 0)
perror("poll"), exit(1);
if (pollfd[1].revents & POLLIN) {
if (read(pollfd[1].fd, &tmp_chr, 1) != 1)
fprintf(stderr, "read pipefd error\n"),
exit(1);
break;
}
if (!(pollfd[0].revents & POLLIN))
fprintf(stderr, "pollfd[0].revents %d\n",
pollfd[0].revents), exit(1);
ret = read(uffd, &msg, sizeof(msg));
if (ret < 0) {
if (errno == EAGAIN)
continue;
perror("nonblocking read error"), exit(1);
}
if (msg.event != UFFD_EVENT_PAGEFAULT)
fprintf(stderr, "unexpected msg event %u\n",
msg.event), exit(1);
if (msg.arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WRITE)
fprintf(stderr, "unexpected write fault\n"), exit(1);
offset = (char *)(unsigned long)msg.arg.pagefault.address -
area_dst;
offset &= ~(page_size-1);
if (copy_page(offset))
userfaults++;
}
return (void *)userfaults;
}
pthread_mutex_t uffd_read_mutex = PTHREAD_MUTEX_INITIALIZER;
static void *uffd_read_thread(void *arg)
{
unsigned long *this_cpu_userfaults;
struct uffd_msg msg;
unsigned long offset;
int ret;
this_cpu_userfaults = (unsigned long *) arg;
*this_cpu_userfaults = 0;
pthread_mutex_unlock(&uffd_read_mutex);
for (;;) {
ret = read(uffd, &msg, sizeof(msg));
if (ret != sizeof(msg)) {
if (ret < 0)
perror("blocking read error"), exit(1);
else
fprintf(stderr, "short read\n"), exit(1);
}
if (msg.event != UFFD_EVENT_PAGEFAULT)
fprintf(stderr, "unexpected msg event %u\n",
msg.event), exit(1);
if (bounces & BOUNCE_VERIFY &&
msg.arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WRITE)
fprintf(stderr, "unexpected write fault\n"), exit(1);
offset = (char *)(unsigned long)msg.arg.pagefault.address -
area_dst;
offset &= ~(page_size-1);
if (copy_page(offset))
(*this_cpu_userfaults)++;
}
return (void *)NULL;
}
static void *background_thread(void *arg)
{
unsigned long cpu = (unsigned long) arg;
unsigned long page_nr;
for (page_nr = cpu * nr_pages_per_cpu;
page_nr < (cpu+1) * nr_pages_per_cpu;
page_nr++)
copy_page(page_nr * page_size);
return NULL;
}
static int stress(unsigned long *userfaults)
{
unsigned long cpu;
pthread_t locking_threads[nr_cpus];
pthread_t uffd_threads[nr_cpus];
pthread_t background_threads[nr_cpus];
void **_userfaults = (void **) userfaults;
finished = 0;
for (cpu = 0; cpu < nr_cpus; cpu++) {
if (pthread_create(&locking_threads[cpu], &attr,
locking_thread, (void *)cpu))
return 1;
if (bounces & BOUNCE_POLL) {
if (pthread_create(&uffd_threads[cpu], &attr,
uffd_poll_thread, (void *)cpu))
return 1;
} else {
if (pthread_create(&uffd_threads[cpu], &attr,
uffd_read_thread,
&_userfaults[cpu]))
return 1;
pthread_mutex_lock(&uffd_read_mutex);
}
if (pthread_create(&background_threads[cpu], &attr,
background_thread, (void *)cpu))
return 1;
}
for (cpu = 0; cpu < nr_cpus; cpu++)
if (pthread_join(background_threads[cpu], NULL))
return 1;
if (madvise(area_src, nr_pages * page_size, MADV_DONTNEED)) {
perror("madvise");
return 1;
}
for (cpu = 0; cpu < nr_cpus; cpu++) {
char c;
if (bounces & BOUNCE_POLL) {
if (write(pipefd[cpu*2+1], &c, 1) != 1) {
fprintf(stderr, "pipefd write error\n");
return 1;
}
if (pthread_join(uffd_threads[cpu], &_userfaults[cpu]))
return 1;
} else {
if (pthread_cancel(uffd_threads[cpu]))
return 1;
if (pthread_join(uffd_threads[cpu], NULL))
return 1;
}
}
finished = 1;
for (cpu = 0; cpu < nr_cpus; cpu++)
if (pthread_join(locking_threads[cpu], NULL))
return 1;
return 0;
}
static int userfaultfd_stress(void)
{
void *area;
char *tmp_area;
unsigned long nr;
struct uffdio_register uffdio_register;
struct uffdio_api uffdio_api;
unsigned long cpu;
int uffd_flags, err;
unsigned long userfaults[nr_cpus];
if (posix_memalign(&area, page_size, nr_pages * page_size)) {
fprintf(stderr, "out of memory\n");
return 1;
}
area_src = area;
if (posix_memalign(&area, page_size, nr_pages * page_size)) {
fprintf(stderr, "out of memory\n");
return 1;
}
area_dst = area;
uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
if (uffd < 0) {
fprintf(stderr,
"userfaultfd syscall not available in this kernel\n");
return 1;
}
uffd_flags = fcntl(uffd, F_GETFD, NULL);
uffdio_api.api = UFFD_API;
uffdio_api.features = 0;
if (ioctl(uffd, UFFDIO_API, &uffdio_api)) {
fprintf(stderr, "UFFDIO_API\n");
return 1;
}
if (uffdio_api.api != UFFD_API) {
fprintf(stderr, "UFFDIO_API error %Lu\n", uffdio_api.api);
return 1;
}
count_verify = malloc(nr_pages * sizeof(unsigned long long));
if (!count_verify) {
perror("count_verify");
return 1;
}
for (nr = 0; nr < nr_pages; nr++) {
*area_mutex(area_src, nr) = (pthread_mutex_t)
PTHREAD_MUTEX_INITIALIZER;
count_verify[nr] = *area_count(area_src, nr) = 1;
*(area_count(area_src, nr) + 1) = 1;
}
pipefd = malloc(sizeof(int) * nr_cpus * 2);
if (!pipefd) {
perror("pipefd");
return 1;
}
for (cpu = 0; cpu < nr_cpus; cpu++) {
if (pipe2(&pipefd[cpu*2], O_CLOEXEC | O_NONBLOCK)) {
perror("pipe");
return 1;
}
}
if (posix_memalign(&area, page_size, page_size)) {
fprintf(stderr, "out of memory\n");
return 1;
}
zeropage = area;
bzero(zeropage, page_size);
pthread_mutex_lock(&uffd_read_mutex);
pthread_attr_init(&attr);
pthread_attr_setstacksize(&attr, 16*1024*1024);
err = 0;
while (bounces--) {
unsigned long expected_ioctls;
printf("bounces: %d, mode:", bounces);
if (bounces & BOUNCE_RANDOM)
printf(" rnd");
if (bounces & BOUNCE_RACINGFAULTS)
printf(" racing");
if (bounces & BOUNCE_VERIFY)
printf(" ver");
if (bounces & BOUNCE_POLL)
printf(" poll");
printf(", ");
fflush(stdout);
if (bounces & BOUNCE_POLL)
fcntl(uffd, F_SETFL, uffd_flags | O_NONBLOCK);
else
fcntl(uffd, F_SETFL, uffd_flags & ~O_NONBLOCK);
uffdio_register.range.start = (unsigned long) area_dst;
uffdio_register.range.len = nr_pages * page_size;
uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING;
if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) {
fprintf(stderr, "register failure\n");
return 1;
}
expected_ioctls = (1 << _UFFDIO_WAKE) |
(1 << _UFFDIO_COPY) |
(1 << _UFFDIO_ZEROPAGE);
if ((uffdio_register.ioctls & expected_ioctls) !=
expected_ioctls) {
fprintf(stderr,
"unexpected missing ioctl for anon memory\n");
return 1;
}
if (madvise(area_dst, nr_pages * page_size, MADV_DONTNEED)) {
perror("madvise 2");
return 1;
}
if (stress(userfaults))
return 1;
if (ioctl(uffd, UFFDIO_UNREGISTER, &uffdio_register.range)) {
fprintf(stderr, "register failure\n");
return 1;
}
if (bounces & BOUNCE_VERIFY) {
for (nr = 0; nr < nr_pages; nr++) {
if (*area_count(area_dst, nr) != count_verify[nr]) {
fprintf(stderr,
"error area_count %Lu %Lu %lu\n",
*area_count(area_src, nr),
count_verify[nr],
nr);
err = 1;
bounces = 0;
}
}
}
tmp_area = area_src;
area_src = area_dst;
area_dst = tmp_area;
printf("userfaults:");
for (cpu = 0; cpu < nr_cpus; cpu++)
printf(" %lu", userfaults[cpu]);
printf("\n");
}
return err;
}
int main(int argc, char **argv)
{
if (argc < 3)
fprintf(stderr, "Usage: <MiB> <bounces>\n"), exit(1);
nr_cpus = sysconf(_SC_NPROCESSORS_ONLN);
page_size = sysconf(_SC_PAGE_SIZE);
if ((unsigned long) area_count(NULL, 0) + sizeof(unsigned long long) * 2
> page_size)
fprintf(stderr, "Impossible to run this test\n"), exit(2);
nr_pages_per_cpu = atol(argv[1]) * 1024*1024 / page_size /
nr_cpus;
if (!nr_pages_per_cpu) {
fprintf(stderr, "invalid MiB\n");
fprintf(stderr, "Usage: <MiB> <bounces>\n"), exit(1);
}
bounces = atoi(argv[2]);
if (bounces <= 0) {
fprintf(stderr, "invalid bounces\n");
fprintf(stderr, "Usage: <MiB> <bounces>\n"), exit(1);
}
nr_pages = nr_pages_per_cpu * nr_cpus;
printf("nr_pages: %lu, nr_pages_per_cpu: %lu\n",
nr_pages, nr_pages_per_cpu);
return userfaultfd_stress();
}
#else
#warning "missing __NR_userfaultfd definition"
int main(void)
{
printf("skip: Skipping userfaultfd test (missing __NR_userfaultfd)\n");
return 0;
}
#endif