#include <stdlib.h>
#include <stdio.h>
#include <stdint.h>
#include <unistd.h>
//#include <sys/types.h>
#define __STDC_FORMAT_MACROS 1
#include <inttypes.h>
#include <string.h>
#include <math.h>

/*
 * LRAND(s): advance the pseudo-random state `s` in place and yield a value
 * derived from the new state.
 *
 * The state is updated as s = s * 41943011 - 2147483647, and the macro's
 * value is that new state shifted right by 32 bits.
 *
 * `s` must be a modifiable lvalue; it appears twice in the expansion, so it
 * must not have side effects.
 * NOTE(review): the shift by 32 presumes a state wider than 32 bits (the
 * caller visible in this file passes a uint64_t) -- confirm before using
 * the macro with any narrower type.
 */
#define LRAND(s) \
(((s) = (s) * 41943011 - 2147483647) >> 32)


//#ifndef MEMSET
//#define MEMSET 1
//#endif




#if defined(__i386__)

/*
 * Read the time-stamp counter on 32-bit x86.
 *
 * The instruction is emitted as the raw opcode bytes 0x0f 0x31 (RDTSC)
 * rather than by mnemonic.  The "=A" output constraint binds the 64-bit
 * result to the EDX:EAX register pair.
 *
 * Returns the counter value as an unsigned 64-bit integer.
 */
static __inline__ uint64_t rdtsc(void)
{
  uint64_t ticks;

  __asm__ __volatile__ (".byte 0x0f, 0x31" : "=A" (ticks));

  return ticks;
}
#elif defined(__x86_64__)


/*
 * Read the time-stamp counter on x86-64.
 *
 * RDTSC delivers the counter in two 32-bit halves: the low half through
 * the "a" register and the high half through the "d" register.  The two
 * halves are stitched back together into a single 64-bit value.
 *
 * Returns the counter value as an unsigned 64-bit integer.
 */
static __inline__ uint64_t rdtsc(void)
{
  uint32_t low_half, high_half;
  uint64_t ticks;

  __asm__ __volatile__ ("rdtsc" : "=a"(low_half), "=d"(high_half));

  ticks = high_half;
  ticks <<= 32;
  ticks |= low_half;
  return ticks;
}

#elif defined(__powerpc__)


/*
 * Read the 64-bit PowerPC time base (named rdtsc() to match the x86
 * variants of this helper).
 *
 * The time base is read as two 32-bit halves.  The upper half is sampled
 * both before and after the lower half; if the two upper samples differ,
 * the lower half carried into the upper half in between and the whole
 * read is retried.
 *
 * The comparison and retry loop are written in C instead of using
 * cmpw/bne inside the asm: cmpw modifies condition register field cr0,
 * which would have to be declared as a clobber for the compiler to
 * generate correct surrounding code.  Each asm statement below touches
 * only its own output register.
 *
 * Returns (upper << 32) | lower as an unsigned 64-bit integer.
 */
static __inline__ uint64_t rdtsc(void)
{
  uint32_t upper, lower, tmp;

  do {
    __asm__ volatile("mftbu %0" : "=r"(upper));
    __asm__ volatile("mftb  %0" : "=r"(lower));
    __asm__ volatile("mftbu %0" : "=r"(tmp));
  } while (tmp != upper);

  return ((uint64_t)upper << 32) | lower;
}

#endif

/*
 * Benchmark driver.
 *
 * For every length from 1 to 32768 bytes, times __builtin_memcpy (or
 * __builtin_memset when MEMSET is defined at compile time) between two
 * rdtsc() readings, averages the tick difference over a fixed number of
 * runs, and writes the average to the file named by argv[1].
 *
 * Output format: one unsigned decimal number per line, in order of
 * increasing length, with no newline after the last value.
 *
 * argv[1]  path of the output file (created or truncated).
 *
 * Returns 0 on success.  Exits with status 1 on a usage error or if the
 * output file cannot be opened; returns 1 if a buffer cannot be allocated
 * or the output cannot be written completely.
 */
int main(int argc, char **argv){
	FILE *out;
	uint64_t clocks1, clocks2;
	uint64_t total;
	uint32_t num1;
	uint32_t reruns;
	size_t size;
	size_t offset;
	size_t max_size = 32768;
	unsigned char *dest;
	unsigned char *src;
	uint64_t rseed = rdtsc();	/* seed the filler generator from the tick counter */
	double tmp1 = 262144;
	int status = 1;

	if(argc != 2){
		fprintf(stderr, "usage: %s <output-file>\n",
		        (argc > 0 && argv[0] != NULL) ? argv[0] : "program");
		exit(1);
	}

	out = fopen(argv[1], "w");
	if(out == NULL){
		perror(argv[1]);
		exit(1);
	}

	dest = malloc(max_size);
	src = malloc(max_size);
	if(dest == NULL || src == NULL){
		fprintf(stderr, "cannot allocate two %zu-byte buffers\n", max_size);
		goto cleanup;
	}

	/* Fill both buffers with pseudo-random bytes (src first, then dest,
	 * for each offset) so every page is touched before timing starts. */
	for(offset=0; offset<max_size; offset++){
		src[offset] = (unsigned char)(LRAND(rseed) % 256);
		dest[offset] = (unsigned char)(LRAND(rseed) % 256);
	}

	/* Number of timed runs per length; does not depend on the length.
	 * NOTE(review): the divisor is max_size, not the current size, so
	 * this is always 8 -- confirm whether per-size scaling was intended. */
	reruns = (uint32_t)ceil(tmp1/max_size);

	for(size=1; size<=max_size; size++){
		total = 0;
		for(num1=0; num1<reruns; num1++){

			clocks1 = rdtsc();
			/* Compiler-only barriers (no instructions emitted): dest is
			 * never read back, so without them an optimising compiler may
			 * drop the operation or move it out of the timed window. */
			__asm__ __volatile__("" : : "r"(dest), "r"(src) : "memory");
#ifndef MEMSET
			(void)__builtin_memcpy(dest, src, size);
#else
			(void)__builtin_memset(dest, (int)num1, size);
#endif
			__asm__ __volatile__("" : : "r"(dest), "r"(src) : "memory");
			clocks2 = rdtsc();
			total += clocks2-clocks1;
		}
		total /= reruns;
		fprintf(out, "%"PRIu64"", total);

		/* Newline separates values; none after the final one. */
		if(size != max_size)
			fprintf(out, "\n");
	}

	if(ferror(out))
		fprintf(stderr, "%s: write error\n", argv[1]);
	else
		status = 0;

cleanup:
	free(dest);
	free(src);
	/* fclose flushes buffered output, so a failure here means lost data. */
	if(fclose(out) != 0){
		perror(argv[1]);
		status = 1;
	}
	return status;
}
