#include #include #include #include #include #include #include #include #include uint64_t my_fork(void); void __attribute((noinline)) spin(uint64_t loops) { for (volatile uint64_t i = 0; i < loops; i++) { } } struct thing { uint64_t b; uint32_t c; // Making this (plus 'sink' below) uint64_t may make repro take longer? }; volatile struct thing* page; volatile uint32_t sink; int ready; void* thread(void* arg) { __atomic_store_n(&ready, 1, __ATOMIC_SEQ_CST); while (1) { // Spin not strictly required, but it speeds up repro in my case. spin(40*1000); // Atomic not required, this works too: // page->c = sink; __atomic_store_n(&page->c, sink, __ATOMIC_SEQ_CST); sink++; } } int main(void) { page = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); if (page == MAP_FAILED) { perror("mmap"); return 1; } pthread_t thread_id; int ret = pthread_create(&thread_id, NULL, &thread, NULL); if (ret != 0) { perror("pthread_create"); return 1; } // Wait for child thread to start. // // This is not required to repro, but eliminates racing fork+thread create as // a possibility. while (!__atomic_load_n(&ready, __ATOMIC_SEQ_CST)) { } int64_t i = 0; while (1) { i++; if (i % 10000 == 0) { printf("Loop %d...\n", i); } page->b = 102; // Does not work with libc fork(). libc fork() is significantly slower, // which may be the problem. uint64_t pid = my_fork(); if (pid == 0) { /* Child */ _exit(0); } /* Parent */ /* spin(40*1000); may speed up repro. */ page->b = 2; uint64_t pb = page->b; if (pb != 2) { printf("Corruption! pb, page->b = %lu, %lu\n", pb, page->b); _exit(1); } int status; ret = waitpid(pid, &status, 0); if (ret < 0) { perror("waitpid"); return 1; } if (WEXITSTATUS(status) != 0) { printf("Bad child status %#x\n", status); return 1; } } }