#include <stdlib.h>
#include <stdio.h>
#include <stdint.h>
#include <unistd.h>
//#include <sys/types.h>
#define __STDC_FORMAT_MACROS 1
#include <inttypes.h>
#include <string.h>

/*
 * LRAND(s) - step a simple multiply-and-subtract pseudo-random generator.
 *
 * The state `s` is overwritten with s * LRAND_MULTIPLIER - LRAND_DECREMENT,
 * and the macro evaluates to that new state shifted right by 32 bits.
 *
 * `s` must be a modifiable lvalue without side effects, because it appears
 * twice in the expansion.  It must also be wider than 32 bits for the shift
 * to be well defined; this file passes a uint64_t.
 */
#define LRAND_MULTIPLIER 41943011
#define LRAND_DECREMENT 2147483647
#define LRAND(s) (((s) = (s) * LRAND_MULTIPLIER - LRAND_DECREMENT) >> 32)

#if defined(__i386__)

/*
 * Read the CPU time-stamp counter (32-bit x86 build).
 *
 * Returns the 64-bit counter value produced by the RDTSC instruction.
 */
static __inline__ uint64_t rdtsc(void)
{
  uint64_t x;
     /*
      * 0x0f 0x31 is the opcode encoding of RDTSC, emitted as raw bytes.
      * With a 32-bit target the "=A" constraint binds the 64-bit output to
      * the EDX:EAX register pair, which is where RDTSC leaves its result.
      */
     __asm__ volatile (".byte 0x0f, 0x31" : "=A" (x));
     return x;
}
#elif defined(__x86_64__)


/*
 * Read the CPU time-stamp counter (x86-64 build).
 *
 * RDTSC delivers the low 32 bits of the counter in EAX and the high 32 bits
 * in EDX; the "=a" and "=d" constraints capture those two halves, which are
 * then joined into a single 64-bit value.
 */
static __inline__ uint64_t rdtsc(void)
{
  uint32_t low_half, high_half;
  uint64_t ticks;

  __asm__ __volatile__ ("rdtsc" : "=a"(low_half), "=d"(high_half));

  ticks = high_half;
  ticks <<= 32;
  ticks |= low_half;
  return ticks;
}

#elif defined(__powerpc__)


/*
 * Read the 64-bit PowerPC time base as two 32-bit halves (PowerPC build).
 *
 * The asm loop reads the upper half (mftbu), then the lower half (mftb),
 * then the upper half again.  If the two upper reads differ, the lower half
 * wrapped between them, so the sequence branches back to label 0 and
 * retries.  The halves are then combined as (upper << 32) | lower.
 *
 * NOTE(review): cmpw updates a condition-register field but the asm
 * statement declares no clobbers -- confirm whether a "cr0" clobber should
 * be added.
 */
static __inline__ uint64_t rdtsc(void)
{
  uint64_t result=0;
  uint32_t upper, lower,tmp;
  /* %0 = upper, %1 = lower, %2 = tmp (second read of the upper half) */
  __asm__ volatile(
                "0:                  \n"
                "\tmftbu   %0           \n"
                "\tmftb    %1           \n"
                "\tmftbu   %2           \n"
                "\tcmpw    %2,%0        \n"
                "\tbne     0b         \n"
                : "=r"(upper),"=r"(lower),"=r"(tmp)
                );
  /* Assemble the 64-bit value from the two consistent halves. */
  result = upper;
  result = result<<32;
  result = result|lower;

  return(result);
}

#endif




/*
void memcpy1 ( void * destination, const void * source, size_t num ){
	size_t i;
	for(i=0; i<num; i++){
		*(((char *)destination)+i) = *(((char *)source)+i);
	}
}
*/
//void * memcpy2 ( void * destination, const void * source, size_t num );
//void * memcpy3 ( void * destination, const void * source, size_t num );




/*
 * Benchmark __builtin_memcpy on small buffers.
 *
 * For every copy size from 1 to MAX_COPY_SIZE bytes, a fresh source and
 * destination buffer are allocated RUNS_PER_SIZE times, the source is filled
 * with pseudo-random bytes from LRAND(), and one copy is timed with rdtsc().
 * The average counter delta for each size is written to stdout, one value
 * per line, with no newline after the last value.
 *
 * The command-line arguments are not used.
 *
 * Returns 0 on success, or EXIT_FAILURE if a buffer cannot be allocated.
 */
int main(int argc, char **argv){
	enum { MAX_COPY_SIZE = 65, RUNS_PER_SIZE = 32 };
	uint64_t rseed = rdtsc();	/* seed the byte generator from the counter */
	size_t size;

	(void)argc;
	(void)argv;

	for(size=1; size<=MAX_COPY_SIZE; size++){
		uint64_t total = 0;
		uint32_t run;

		for(run=0; run<RUNS_PER_SIZE; run++){
			uint64_t clocks1, clocks2;
			size_t i;
			char *dest = malloc(size);
			char *src = malloc(size);

			if(dest == NULL || src == NULL){
				fprintf(stderr, "failed to allocate %zu-byte buffers\n", size);
				free(dest);	/* free(NULL) is a no-op */
				free(src);
				return EXIT_FAILURE;
			}

			for(i=0; i<size; i++)
				src[i] = (char)(LRAND(rseed) % 256);

			/*
			 * Empty asm statements act as compiler barriers: they make
			 * both buffers observable, so the copy cannot be hoisted out
			 * of the timed region or discarded because dest is only
			 * freed afterwards.  They emit no instructions.
			 */
			__asm__ __volatile__("" : : "r"(src), "r"(dest) : "memory");
			clocks1 = rdtsc();
			(void)__builtin_memcpy(dest, src, size);
			__asm__ __volatile__("" : : "r"(dest) : "memory");
			clocks2 = rdtsc();

			total += clocks2-clocks1;
			free(dest);
			free(src);
		}
		total /= RUNS_PER_SIZE;

		printf("%" PRIu64, total);
		/* Separate values with newlines, but leave the last line unterminated. */
		if(size != MAX_COPY_SIZE)
			printf("\n");
	}
	return 0;
}
