#ifdef _KERNEL
#include <sys/types.h>
#include <sys/param.h>
#include <sys/module.h>
#include <sys/kernel.h>
#include <sys/systm.h>
#include <sys/conf.h>
#include <sys/malloc.h>
#else
#include <errno.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/types.h>
#endif

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <fcntl.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <unistd.h>
#include <sys/mman.h>
#include <string.h>


/*
 * Read the x86 time-stamp counter.
 *
 * The RDTSC instruction returns the counter split across EDX:EAX; the two
 * 32-bit halves are recombined into a single 64-bit value here.  The
 * standard <stdint.h> types are used so the helper agrees with its declared
 * return type and does not rely on the BSD-only u_int32_t/u_int64_t names.
 */
static __inline uint64_t
rdtsc(void)
{
        uint32_t lo, hi;

        __asm __volatile("rdtsc" : "=a" (lo), "=d" (hi));
        return (((uint64_t)hi << 32) | lo);
}

//#include <machine/cpufunc.h>
//#define rdtscl(low) \
//__asm__ __volatile__ ("rdtsc" : "=a" (low) : : "edx")

/*
 * Forward declarations: the two histogram helpers and the measurement
 * routine, all defined later in this file.
 */
void	bucket_add(uint64_t);
void	buckets_print(uint64_t);
long	memtime(long *, long, int);

#define	BUCKETS		2000	/* Gives us 2us worth of time (one bucket per ns) */
#define	BINVAL		5	/* Anything within 5ns is the same */

int buckets[BUCKETS];	/* Latency histogram, indexed by nanoseconds */
int ticks;		/* TSC ticks per microsecond */
int binning = 1;	/* Samples are rounded down to a multiple of this (ns) */

/*
 * Record one latency sample in the histogram.
 *
 * time: elapsed time in TSC ticks.
 *
 * The sample is converted to nanoseconds using the global `ticks` (ticks
 * per microsecond), rounded down to a multiple of `binning`, and the
 * matching bucket is incremented.  Samples that cannot be represented are
 * silently dropped:
 *   - `ticks` or `binning` is not positive (would divide by zero);
 *   - `time * 1000` would overflow 64 bits, which also catches a caller
 *     delta that wrapped below zero;
 *   - the result is at or beyond BUCKETS (the last valid index is
 *     BUCKETS - 1).
 */
void
bucket_add(uint64_t time)
{
	uint64_t ns;

	if (ticks <= 0 || binning <= 0)
		return;
	if (time > UINT64_MAX / 1000)
		return;

	/* This effectively yields ns */
	ns = (time * 1000) / (uint64_t)ticks;
	ns = (ns / (uint64_t)binning) * (uint64_t)binning;
	if (ns >= BUCKETS)
		return;
	buckets[ns]++;
}

/*
 * Print every non-empty histogram bucket as "<index>\t<count>", one per
 * line, in increasing index order.
 *
 * The `ticks` argument is not used; it is kept so existing callers keep
 * compiling.
 */
void
buckets_print(uint64_t ticks)
{
	const int *slot;

	(void)ticks;
	for (slot = buckets; slot < buckets + BUCKETS; slot++) {
		if (*slot == 0)
			continue;
		printf("%d\t%d\n", (int)(slot - buckets), *slot);
	}
}

/*
 * Not referenced anywhere in the visible source.
 * NOTE(review): presumably a leftover sink for defeating the optimizer --
 * confirm nothing else links against it before removing.
 */
u_int trash;

/*
 * Time single loads from random locations in a buffer and histogram the
 * results.
 *
 * mem:     buffer to probe; it is zero-filled on entry.
 * memsize: size of `mem` in bytes.
 * iters:   number of timed loads to perform.
 *
 * Side effects: clears the global `buckets` histogram, sets the global
 * `ticks` (TSC ticks per microsecond), prints two status lines, and in the
 * userland build blocks in sleep(1) while calibrating.
 *
 * Returns the sum of (loaded value + 1) over all iterations.  The value
 * carries no meaning; it exists so the timed load has a visible use.
 */
long
memtime(long *mem, long memsize, int iters)
{
	uint64_t start;
	uint64_t end;
	int tscticks;
	long total;
	long *loc;
	long val;
	int i;

	memset((void *)mem, 0, memsize);
	memset((void *)buckets, 0, sizeof(buckets));
	printf("mem %p, memsize %ld\n", mem, memsize);
	tscticks = 0;
	/*
	 * Fake return val to prevent gcc from optimizing away the load.
	 */
	total = 0;
	/*
	 * Estimate the fixed cost of the rdtsc/pause/rdtsc sequence itself:
	 * run it 10 times and keep the smallest delta seen.
	 */
	for (i = 0; i < 10; i++) {
		__asm("mfence; pause;"); 
		start = rdtsc();
		__asm __volatile("pause;");
		end = rdtsc();
		if (tscticks == 0)
			tscticks = end - start;
		else if (tscticks > (end - start))
			tscticks = end - start;
	}

#ifdef _KERNEL
	/*
	 * Hard-coded ticks-per-microsecond value for the kernel build.
	 * NOTE(review): presumably matches a 1.8GHz TSC -- confirm for the
	 * target machine.
	 */
	ticks = 1800;
#else
	start = rdtsc();
	sleep(1);
	end = rdtsc();
	/*
	 * The TSC ticks counted across sleep(1), divided by 1000000, give
	 * ticks per microsecond.
	 */
	ticks = (end - start) / 1000000;
#endif
	printf("ticks per microsecond %d, tsc ticks = %d\n", ticks, tscticks);
	for (; iters > 0; iters--) {
		/*
		 * Pick the next location and compute its address before the
		 * first rdtsc, so that work is not part of the timed region.
		 * If memsize is smaller than sizeof(long) the divisor below
		 * is zero.
		 */
		val = random() % (memsize / sizeof(val));
		loc = mem + val;
		__asm("mfence; pause; pause;");
		start = rdtsc();
		val = *loc;
		__asm __volatile("pause;");
		end = rdtsc();
		val++;
		total += val;
		/*
		 * Subtract the calibrated overhead.  The arithmetic is
		 * unsigned, so a delta smaller than tscticks wraps to a very
		 * large value rather than going negative.
		 */
		bucket_add((end - start) - tscticks);
	}
	return (total);
}

#ifdef _KERNEL
#define	MEMSIZE	(4 * 1024 * 1024)
#define	ITERS 100000000
/*
 * Module event handler.
 *
 * MOD_LOAD:   allocate a MEMSIZE-byte buffer, run memtime() over it twice
 *             with interrupts disabled, print the histogram and free the
 *             buffer.  memtime() clears the histogram on entry, so only the
 *             second run's samples are printed.
 * MOD_UNLOAD: nothing to do.
 * Any other event returns EOPNOTSUPP; the handled events return 0.
 */
static int
memtime_load(module_t mod, int cmd, void *arg)
{
	register_t saved;
	long *region;
	long nbytes;

	if (cmd == MOD_UNLOAD)
		return (0);
	if (cmd != MOD_LOAD)
		return (EOPNOTSUPP);

	nbytes = MEMSIZE;
	region = malloc(nbytes, M_TEMP, M_WAITOK);

	/* Keep interrupts off for the whole measurement. */
	saved = intr_disable();
	memtime(region, nbytes, ITERS);
	memtime(region, nbytes, ITERS);
	intr_restore(saved);

	buckets_print(ticks);
	free(region, M_TEMP);
	return (0);
}
DEV_MODULE(memtime, memtime_load, NULL);
#else
/*
 * Parse `str` as a strictly positive decimal integer no larger than
 * `limit`.  Returns 0 and stores the value in *out on success; returns -1
 * on empty input, trailing garbage, overflow or an out-of-range value.
 */
static int
parse_positive(const char *str, long limit, long *out)
{
	char *end;
	long val;

	errno = 0;
	val = strtol(str, &end, 10);
	if (end == str || *end != '\0' || errno == ERANGE)
		return (-1);
	if (val <= 0 || val > limit)
		return (-1);
	*out = val;
	return (0);
}

/* Print the command-line synopsis to stderr and exit with failure. */
static void
usage(const char *prog)
{
	fprintf(stderr, "usage:\n\t%s <memsize> <iterations> [binning]\n",
	    prog);
	exit(EXIT_FAILURE);
}

/*
 * Userland entry point: memtime <memsize> <iterations> [binning].
 *
 * memsize:    buffer size in bytes; must hold at least one long, because
 *             memtime() indexes the buffer in long-sized slots.
 * iterations: number of timed loads; must be positive.
 * binning:    optional histogram granularity in ns; must be positive,
 *             because bucket_add() divides by it.
 *
 * Arguments are parsed with strtol() so malformed or out-of-range input is
 * rejected with the usage message rather than being silently read as 0 or
 * truncated.
 */
int
main(int argc, char **argv)
{
	long memsize;
	long value;
	long *mem;
	int iters;

	if (argc != 3 && argc != 4)
		usage(argv[0]);
	if (parse_positive(argv[1], LONG_MAX, &memsize) != 0 ||
	    memsize < (long)sizeof(*mem))
		usage(argv[0]);
	if (parse_positive(argv[2], INT_MAX, &value) != 0)
		usage(argv[0]);
	iters = (int)value;
	if (argc == 4) {
		if (parse_positive(argv[3], INT_MAX, &value) != 0)
			usage(argv[0]);
		binning = (int)value;
	}

	mem = malloc(memsize);
	if (mem == NULL) {
		perror("malloc");
		exit(EXIT_FAILURE);
	}
	memtime(mem, memsize, iters);
	buckets_print(ticks);
	free(mem);
	exit(EXIT_SUCCESS);
}
#endif

