#include <stdio.h>
#include <stdlib.h>
#include <math.h>
//#include <sys/time.h>
#include <time.h>
#include <unistd.h>
//#include <sys/types.h>
/* File-local boolean type: one unsigned byte (this file does not include <stdbool.h>). */
typedef unsigned char bool;

/* Write the one-line command synopsis to standard output. */
void usage(void)
{
	fputs("./lat <max memory size MB>\n", stdout);
}

//struct timespec ts;

#if defined(__i386__)

/*
 * Read the 64-bit time-stamp counter (32-bit x86 variant).
 *
 * CPUID is issued first as a serializing instruction so that earlier
 * instructions have completed before the counter is sampled.
 *
 * Returns the counter value: EDX holds the high 32 bits, EAX the low 32.
 */
static __inline__ unsigned long long read_rdtsc(void)
{
  unsigned int leaf = 0;     /* CPUID leaf selector (EAX); 0 is always valid */
  unsigned int subleaf = 0;  /* CPUID sub-leaf selector (ECX) */
  unsigned int lo, hi;

  /*
   * CPUID overwrites EAX, EBX, ECX and EDX.  All four must be declared,
   * otherwise the compiler may keep live values in them across the
   * statement.  "memory" stops the compiler from moving loads/stores
   * across the serialization point.
   * NOTE(review): very old GCC releases reject an EBX clobber in 32-bit
   * PIC builds -- confirm against the toolchain in use.
   */
  __asm__ __volatile__ ("cpuid"
                        : "+a"(leaf), "+c"(subleaf)
                        :
                        : "ebx", "edx", "memory");

  /* 0x0f 0x31 is the opcode of RDTSC; result is returned in EDX:EAX. */
  __asm__ __volatile__ (".byte 0x0f, 0x31" : "=a"(lo), "=d"(hi));

  return ((unsigned long long)hi << 32) | (unsigned long long)lo;
}
#elif defined(__x86_64__)


/*
 * Read the 64-bit time-stamp counter (x86-64 variant).
 *
 * CPUID is issued first as a serializing instruction so that earlier
 * instructions have completed before the counter is sampled.
 *
 * Returns the counter value: RDTSC places the high 32 bits in EDX and the
 * low 32 bits in EAX.
 */
static __inline__ unsigned long long read_rdtsc(void)
{
  unsigned int leaf = 0;     /* CPUID leaf selector (EAX); 0 is always valid */
  unsigned int subleaf = 0;  /* CPUID sub-leaf selector (ECX) */
  unsigned hi, lo;

  /*
   * CPUID overwrites EAX, EBX, ECX and EDX.  All four must be declared
   * (RBX in particular is callee-saved), otherwise the compiler may keep
   * live values in them across the statement.  "memory" stops the
   * compiler from moving loads/stores across the serialization point.
   */
  __asm__ __volatile__ ("cpuid"
                        : "+a"(leaf), "+c"(subleaf)
                        :
                        : "ebx", "edx", "memory");

  __asm__ __volatile__ ("rdtsc" : "=a"(lo), "=d"(hi));

  return ((unsigned long long)lo) | (((unsigned long long)hi) << 32);
}

#elif defined(__powerpc__)


/*
 * Read the 64-bit PowerPC time base (PowerPC counterpart of the x86
 * time-stamp counter reader of the same name).
 *
 * The upper half is read before and after the lower half; the sequence is
 * retried until both upper reads agree, so a carry from the lower into the
 * upper half between the reads cannot produce a torn value.
 *
 * Returns (upper << 32) | lower.
 */
static __inline__ unsigned long long read_rdtsc(void)
{
  unsigned long int upper, lower, tmp;

  /*
   * No "cpuid" here: that is an x86 instruction and does not assemble on
   * PowerPC.  cmpw writes the condition register, hence the "cc" clobber;
   * "memory" keeps the compiler from moving loads/stores across the read.
   */
  __asm__ volatile(
                "0:                  \n"
                "\tmftbu   %0           \n"
                "\tmftb    %1           \n"
                "\tmftbu   %2           \n"
                "\tcmpw    %2,%0        \n"
                "\tbne     0b         \n"
                : "=r"(upper),"=r"(lower),"=r"(tmp)
                :
                : "cc", "memory"
                );

  return ((unsigned long long)upper << 32) | (unsigned long long)lower;
}

#endif



/*
static inline unsigned long long read_rdtsc(void)
{
unsigned long long d;
__asm__ __volatile__ ("rdtsc" : "=A" (d) );
return d;
}
*/

/*
 * Execution barrier placed around timed regions.
 *
 * On x86 this executes CPUID (leaf 0), a serializing instruction, and
 * discards its results.  On every other architecture it degrades to a
 * compiler-only barrier so the file still builds there.
 */
static inline void cpuid(void){
#if defined(__i386__) || defined(__x86_64__)
	unsigned int leaf = 0;     /* EAX: leaf selector, 0 is always valid */
	unsigned int subleaf = 0;  /* ECX: sub-leaf selector */

	/*
	 * CPUID overwrites EAX, EBX, ECX and EDX; all four are declared so the
	 * compiler does not keep live values in them across the statement.
	 */
	__asm__ __volatile__ ("cpuid"
	                      : "+a"(leaf), "+c"(subleaf)
	                      :
	                      : "ebx", "edx", "memory");
#else
	__asm__ __volatile__ ("" : : : "memory");
#endif
}


#ifdef __linux__
/*
 * Return the first "cpu MHz" value listed in /proc/cpuinfo, or 1 when the
 * file contains no such line.  Prints an error and exits with status 1 if
 * the file cannot be opened.
 */
static double lat_cpu_mhz_from_cpuinfo(void)
{
	FILE *fp;
	char line[256];
	double mhz = 1;

	if( ( fp = fopen( "/proc/cpuinfo", "r" ) ) == NULL ) {
		fprintf( stderr, "Error opening /proc/cpuinfo !\n" );
		exit( 1 );
	}
	/* Expected line shape: "cpu MHz		: 3208.243" */
	while( fgets(line, sizeof(line), fp) != NULL ){
		/* sscanf leaves mhz untouched unless the number converts. */
		if( sscanf(line, "cpu MHz : %lf", &mhz) == 1 ){
			break;
		}
	}
	fclose( fp );
	return mhz;
}
#endif

/*
 * Write-latency benchmark driver.
 *
 * Usage: ./lat <max memory size MB>
 *
 * For a growing buffer size (1 KB, 4 KB, then steps that double every four
 * rows) it times one pass of each *_Byte_Walk routine with the cycle
 * counter and prints one table row per size.  For every stride the
 * sequence is: measure a zero-length call as the overhead baseline, run one
 * untimed pass to prime, then time one pass and subtract the baseline.
 *
 * Returns 0 on success; exits with status 1 on bad arguments, on failure
 * to open /proc/cpuinfo (Linux) or on allocation failure.
 */
int main(int argc, char **argv){
	const unsigned int bytes_per_mb = 1024u * 1024u;
	char *memory;
	char *resized;
	char *arg_end;
	long requested_mb;
	unsigned int max_memory_bytes = 0;
	unsigned int cur_memory_bytes = 1024;
	unsigned int step_bytes;
	bool value;
	long double te0, te1, te2, te3, te4, te5;
	long double calc1, calc2;
	long long base;
	int ret_val2;
	unsigned int increment_offset = 0;
	unsigned int increment_bytes = 4096;
	unsigned char increment_count = 0;
	double CPU_MHZ = 1;
	unsigned long long first_clock;

	srand ( (unsigned int)time(NULL) );

	if(argc != 2){
		usage();
		exit(1);
	}

	/*
	 * Reject input with no digits, negative sizes, and sizes whose byte
	 * count does not fit in unsigned int.  Trailing characters after the
	 * number are tolerated.
	 */
	requested_mb = strtol(argv[1], &arg_end, 10);
	if(arg_end == argv[1] || requested_mb < 0
	   || (unsigned long)requested_mb > (unsigned int)-1 / bytes_per_mb){
		usage();
		exit(1);
	}
	max_memory_bytes = (unsigned int)requested_mb * bytes_per_mb;

#ifdef __linux__
	CPU_MHZ = lat_cpu_mhz_from_cpuinfo();
	printf("CPU frequency: %.02lf\n", CPU_MHZ);
#endif
	/* 1 is the "not detected" sentinel. */
	if(CPU_MHZ == 1){
		printf("Warning: your platform is not supported yet, numbers will be inaccurate!\n");
		CPU_MHZ = 3000;
	}

	printf("Write Latencies(cpu cycles):\n");
	printf("\tsize\t|\t\tstride(bytes)\n");
	printf("\t(KB)" "\t| 4" "\t  8" "\t  16" "\t  32" "\t  64" "\t  128\n");

	memory = malloc(512);
	if(memory == NULL){
		fprintf( stderr, "Error allocating memory !\n" );
		exit(1);
	}

	/* Counter ticks elapsed across a one second sleep. */
	first_clock = read_rdtsc();
	sleep(1);
	base = (long long)(read_rdtsc() - first_clock);

	printf("estimated cpu freq: %lldMHz\n", base/1000/1000);

	while(cur_memory_bytes < max_memory_bytes){

		/* Keep the old pointer until the resize is known to have worked. */
		resized = realloc(memory, cur_memory_bytes);
		if(resized == NULL){
			fprintf( stderr, "Error allocating memory !\n" );
			free(memory);
			exit(1);
		}
		memory = resized;

		/* ---- 4-byte stride ---- */
		/* baseline: zero-length call */
		value = (rand() % 50 < 25) ? 1 : 0;
		first_clock = read_rdtsc();
		ret_val2 = Four_Byte_Walk(memory, 0, value);
		cpuid();
		base = (long long)(read_rdtsc() - first_clock);

		/* prime */
		value = (rand() % 50 < 25) ? 1 : 0;
		ret_val2 = Four_Byte_Walk(memory, cur_memory_bytes, value);

		/* timed pass */
		value = (rand() % 50 < 25) ? 1 : 0;
		first_clock = read_rdtsc();
		ret_val2 = Four_Byte_Walk(memory, cur_memory_bytes, value);
		cpuid();
		/* Signed difference: a pass shorter than the baseline must not wrap. */
		te0 = (long double)((long long)(read_rdtsc() - first_clock) - base);
		printf("ret: %d value: %d mem1: %d\n", ret_val2, (int)value, (int)memory[0]);

		/* ---- 8-byte stride ---- */
		first_clock = read_rdtsc();
		Eight_Byte_Walk(memory, 0, 1);
		cpuid();
		base = (long long)(read_rdtsc() - first_clock);

		Eight_Byte_Walk(memory, cur_memory_bytes, 160);
		first_clock = read_rdtsc();
		Eight_Byte_Walk(memory, cur_memory_bytes, 32);
		cpuid();
		te1 = (long double)((long long)(read_rdtsc() - first_clock) - base);

		/* ---- 16-byte stride ---- */
		first_clock = read_rdtsc();
		Sixteen_Byte_Walk(memory, 0, 1);
		cpuid();
		base = (long long)(read_rdtsc() - first_clock);

		Sixteen_Byte_Walk(memory, cur_memory_bytes, 320);
		first_clock = read_rdtsc();
		Sixteen_Byte_Walk(memory, cur_memory_bytes, 64);
		cpuid();
		te2 = (long double)((long long)(read_rdtsc() - first_clock) - base);

		/* ---- 32-byte stride ---- */
		first_clock = read_rdtsc();
		Thirtytwo_Byte_Walk(memory, 0, 1);
		cpuid();
		base = (long long)(read_rdtsc() - first_clock);

		Thirtytwo_Byte_Walk(memory, cur_memory_bytes, 640);
		first_clock = read_rdtsc();
		Thirtytwo_Byte_Walk(memory, cur_memory_bytes, 128);
		cpuid();
		te3 = (long double)((long long)(read_rdtsc() - first_clock) - base);

		/* ---- 64-byte stride ---- */
		first_clock = read_rdtsc();
		Sixtyfour_Byte_Walk(memory, 0, 1);
		cpuid();
		base = (long long)(read_rdtsc() - first_clock);

		Sixtyfour_Byte_Walk(memory, cur_memory_bytes, 1280);
		first_clock = read_rdtsc();
		Sixtyfour_Byte_Walk(memory, cur_memory_bytes, 256);
		cpuid();
		te4 = (long double)((long long)(read_rdtsc() - first_clock) - base);

		/* ---- 128-byte stride ---- */
		first_clock = read_rdtsc();
		Onetwentyeight_Byte_Walk(memory, 0, 1);
		cpuid();
		base = (long long)(read_rdtsc() - first_clock);

		Onetwentyeight_Byte_Walk(memory, cur_memory_bytes, 2560);
		first_clock = read_rdtsc();
		Onetwentyeight_Byte_Walk(memory, cur_memory_bytes, 512);
		cpuid();
		te5 = (long double)((long long)(read_rdtsc() - first_clock) - base);

		/* ---- table row ---- */
		printf("\t%.00lf", (double)cur_memory_bytes/1024 );

		/* The 4-byte column is scaled by the walk's own return value. */
		calc2 = (long double)ret_val2*15/4;
		calc1 = te0/calc2;
		printf("\t %.02lf", (double)calc1 );

		/* The remaining columns are scaled by the buffer size in bytes. */
		calc2 = (long double)cur_memory_bytes;
		calc1 = te1/calc2;
		printf("\t %.02lf", (double)calc1 );
		calc1 = te2/calc2;
		printf("\t %.02lf", (double)calc1 );
		/* te3 already has its own baseline removed, like every other column. */
		calc1 = te3/calc2;
		printf("\t %.02lf", (double)calc1 );
		calc1 = te4/calc2;
		printf("\t %.02lf", (double)calc1 );
		calc1 = te5/calc2;
		printf("\t %.02lf\n", (double)calc1 );

		/*
		 * Size progression: 1 KB, 4 KB, then += increment_bytes (plus the
		 * alignment offset); the increment doubles after every fourth row.
		 */
		increment_count++;
		if(cur_memory_bytes == 1024){
			cur_memory_bytes = 4096;
		}else{
			step_bytes = increment_bytes + increment_offset;
			/*
			 * Reaching or passing the limit ends the run; comparing against
			 * the remaining headroom also prevents unsigned wrap-around.
			 */
			if(step_bytes >= max_memory_bytes - cur_memory_bytes){
				break;
			}
			cur_memory_bytes += step_bytes;
		}
		if(increment_count == 4){
			increment_count = 0;
			increment_bytes *= 2;

			/* Distance from increment_bytes up to the next multiple of 2048. */
			increment_offset = increment_bytes;
			while(increment_offset%2048 != 0)
				increment_offset++;
			increment_offset -= increment_bytes;
		}
	}

	free(memory);

	return 0;
}
