#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <pthread.h>
#include <time.h>
#include <sys/time.h>
#include <string.h>
#include <xmmintrin.h>

/* Wall-clock samples taken by main() immediately before and after each
 * timed benchmark phase. */
struct timeval starttime,endtime;

/* Backing buffers for the memory benchmarks. main() allocates and fills
 * them, then records one slice of each per thread in mem_table. */
long *a;
long *b;
/* Four-float operand vectors for the floating-point benchmark; main()
 * fills them with rand()-derived values before starting FloatWork threads. */
        float z0[4];
        float z1[4];
        //float z2[4]; 
        //float z3[4];


/* Per-thread work descriptor for the memory benchmarks. main() allocates one
 * entry per thread in mem_table, and every MemWork* routine finds its own
 * entry by indexing mem_table with the thread id it is handed. */
struct mem_chunk{
	int thread_id;		/* index of this entry within mem_table */
	long block_size;	/* size in bytes of this thread's slice of each buffer */
	//long asize;
	long *a;		/* start of this thread's slice of the global a buffer */
	long *b;		/* start of this thread's slice of the global b buffer */
}*mem_table;

//struct float_data{
//	int thread_id;
//	float z3[8];
	//float z3[8];
//}*ft;

void *FloatWork(void *null);

void *MemWork1(void *mem_arg);
void *MemWork2(void *mem_arg);
void *MemWork3(void *mem_arg);
void *MemWork4(void *mem_arg);
void *MemWork5(void *mem_arg);

//void *FloatWork(void *null);

/* Write the command-line synopsis, followed by a blank line, to stdout. */
void usage(void){
        static const char help_text[] =
                "Usage: lsbench [threads] [max mem MB]\n"
                "        -h: this help menu\n"
                "\n";

        fputs(help_text, stdout);
}

int main(int argc, char **argv){
	int thread_count;
	int memtotal;
	int memportion;
	int c, num1;
	int long_size;
	long asize;
	long block_size;
	double te0;

        if(argc != 3){
                usage();
                exit(1);
        }else{
                for(c=0; c<argc; c++){
                        if( !strcmp(argv[c], "-h") || !strcmp(argv[c], "-H") ){
                                usage();
                                exit(0);
                        }
                }
        
                thread_count = atoi(argv[1]);
                memtotal = atoi(argv[2]);
                if(memtotal == 0 || thread_count == 0){
                        usage();
                        exit(1);
                }
        
                printf("\nRunning %d threads\n", thread_count);
                printf("Using %dMB of mem\n", memtotal);
	}

	pthread_t *threads = (pthread_t *)malloc(sizeof(pthread_t) * thread_count);
	memportion = (int)(memtotal / 2 / thread_count );

	mem_table = (struct mem_chunk *)malloc( sizeof(struct mem_chunk) * thread_count);
        long_size=sizeof(long);
        asize = (long)(memtotal / 2 * 1024 * 1024 / long_size);
	block_size = (long)(memportion * 1024 * 1024);

	//for(num1=0; num1<thread_count; num1++){
	//a = calloc(asize, long_size);
	//b = calloc(asize, long_size);

	a = malloc(sizeof(long)*asize);
	b = malloc(sizeof(long)*asize);
	//put some random stuff in them
	srand ( time(NULL) );
	for(num1=0; num1<asize; num1++){
		a[num1] = (long)rand();
		b[num1] = (long)rand();
	}
	asize = (long)(memportion * 1024 * 1024 / long_size);

//printf("here 1\n");
			//mem_table.thread_id = (int *)malloc( sizeof(int)*thread_count);
	for(num1=0; num1<thread_count; num1++){	
			//mem_table[num1].asize = asize;
			mem_table[num1].thread_id = num1;
			mem_table[num1].block_size = block_size;
			mem_table[num1].a = &a[num1*asize];
			mem_table[num1].b = &b[num1*asize];
	}

	//printf("SIZE: %lu %lu\n", sizeof(a), sizeof(b));

	gettimeofday(&starttime, NULL);
//        for(runs=0; runs<4; runs++)
//        {  

		for(num1=0; num1<thread_count; num1++){
			//mem_table.thread_id[num1]=num1;
			if (pthread_create(&threads[num1], NULL, MemWork1, &mem_table[num1].thread_id) != 0)
				perror("pthread_create"), exit(1);
		}
	

		for(num1=0; num1<thread_count; num1++){
			if (pthread_join(threads[num1], NULL) != 0)
				perror("pthread_join"),exit(1);
		}

//	}
	gettimeofday(&endtime, NULL);
	te0=((double)(endtime.tv_sec*1000000-starttime.tv_sec*1000000+endtime.tv_usec-starttime.tv_usec))/1000000;
	printf("\tMemory bandwidth\n");
	// 6 times read+write
	printf("\t\tmemcpy (read+write):\n");
	printf("\t\tMB/s: %lf\n", (double)memtotal*6/te0);

	gettimeofday(&starttime, NULL);
	for(num1=0; num1<thread_count; num1++){
		if (pthread_create(&threads[num1], NULL, MemWork2, &mem_table[num1].thread_id) != 0)
			perror("pthread_create"), exit(1);
	}
	for(num1=0; num1<thread_count; num1++){
		if (pthread_join(threads[num1], NULL) != 0)
			perror("pthread_join"),exit(1);
	}
	gettimeofday(&endtime, NULL);
	te0=((double)(endtime.tv_sec*1000000-starttime.tv_sec*1000000+endtime.tv_usec-starttime.tv_usec))/1000000;

	printf("\t\tmemset (write):\n");	
	printf("\t\tMB/s: %lf\n", (double)memtotal*6/te0);

	gettimeofday(&starttime, NULL);
	for(num1=0; num1<thread_count; num1++){
		if (pthread_create(&threads[num1], NULL, MemWork3, &mem_table[num1].thread_id) != 0)
			perror("pthread_create"), exit(1);
	}
	for(num1=0; num1<thread_count; num1++){
		if (pthread_join(threads[num1], NULL) != 0)
			perror("pthread_join"),exit(1);
	}
	gettimeofday(&endtime, NULL);
	te0=((double)(endtime.tv_sec*1000000-starttime.tv_sec*1000000+endtime.tv_usec-starttime.tv_usec))/1000000;

	printf("\t\tmemmove (read+write):\n");
	printf("\t\tMB/s: %lf\n", (double)memtotal*6/te0);

	gettimeofday(&starttime, NULL);
	for(num1=0; num1<thread_count; num1++){
		if (pthread_create(&threads[num1], NULL, MemWork4, &mem_table[num1].thread_id) != 0)
			perror("pthread_create"), exit(1);
	}
	for(num1=0; num1<thread_count; num1++){
		if (pthread_join(threads[num1], NULL) != 0)
			perror("pthread_join"),exit(1);
	}
	gettimeofday(&endtime, NULL);
	te0=((double)(endtime.tv_sec*1000000-starttime.tv_sec*1000000+endtime.tv_usec-starttime.tv_usec))/1000000;

	printf("\t\tloop assignment (read):\n");
	printf("\t\tMB/s: %lf\n", (double)memtotal*6/te0);

	gettimeofday(&starttime, NULL);
	for(num1=0; num1<thread_count; num1++){
		if (pthread_create(&threads[num1], NULL, MemWork4, &mem_table[num1].thread_id) != 0)
			perror("pthread_create"), exit(1);
	}
	for(num1=0; num1<thread_count; num1++){
		if (pthread_join(threads[num1], NULL) != 0)
			perror("pthread_join"),exit(1);
	}
	gettimeofday(&endtime, NULL);
	te0=((double)(endtime.tv_sec*1000000-starttime.tv_sec*1000000+endtime.tv_usec-starttime.tv_usec))/1000000;
	printf("\t\tloop assignment (write):\n");
	printf("\t\tMB/s: %lf\n\n", (double)memtotal*6/te0);

	free(a);
	free(b);

	//ft = (struct float_data *)malloc( sizeof(struct float_data) * thread_count);

        //float *z3;
        //z2 = (float *)malloc( sizeof(float)*thread_count );
	//for(num1=0; num1<thread_count; num1++){
	//	z3[num1]=(float *)malloc( 8*sizeof(float) );
	//	for(num2=0; num2<8; num2++){
	//		ft[num1].z3[num2]=&z3[num1][num2];
	//	}
	//}

	printf("Running Floating Point Benchmarks\n\n");
                
	printf("\tSingle Threaded Performance:\n");


        for(num1=0; num1<4; num1++){
                srand (time(NULL));
                z0[num1]=rand()+1;
                z1[num1]=rand()-1;
                //z1[num1]=rand()+1;
                //z0[num1]=rand()-1;
        }


	gettimeofday(&starttime, NULL);


/*
	for(num1=0; num1<8; num1++){     
		srand ( num1 );
		z2[num1]=rand();
		while(z2[num1] == 0){
			z2[num1]=rand();
		}
	}

	//gettimeofday(&starttime, NULL);
	//6 billion ops

	for(num1=0; num1<125000000; num1++){
                        z2[0]=z2[1]+z2[2]*z2[3]+z2[7]*z2[6]+z2[5]*z2[4];
                        z2[1]=z2[2]+z2[3]*z2[0]+z2[6]*z2[5]+z2[4]*z2[7];
                        z2[2]=z2[3]+z2[0]*z2[1]+z2[5]*z2[4]+z2[7]*z2[6];
                        z2[3]=z2[0]+z2[1]*z2[2]+z2[4]*z2[7]+z2[6]*z2[5];
                        z2[0]=z2[1]+z2[2]*z2[3]+z2[7]*z2[6]+z2[5]*z2[4];
                        z2[1]=z2[2]+z2[3]*z2[0]+z2[6]*z2[5]+z2[4]*z2[7];
                        z2[2]=z2[3]+z2[0]*z2[1]+z2[5]*z2[4]+z2[7]*z2[6];
                        z2[3]=z2[0]+z2[1]*z2[2]+z2[4]*z2[7]+z2[6]*z2[5];
	}
*/

	if (pthread_create(&threads[0], NULL, FloatWork, NULL) != 0)
		perror("pthread_create"), exit(1);
	if (pthread_join(threads[0], NULL) != 0)
		perror("pthread_join"),exit(1);
	

	gettimeofday(&endtime, NULL);
	te0=((double)(endtime.tv_sec*1000000-starttime.tv_sec*1000000+endtime.tv_usec-starttime.tv_usec))/1000000;
	te0 = 6/te0;
	printf("\t\tGFLOPS(estimated): %lf\n\n", te0);

	printf("\tMulti-Threaded Performance:\n");

	/*
        for(num1=0; num1<8; num1++){
                srand ( num1 );
		for(num2=0; num2<thread_count; num2++){
                	z3[num2][num1]=rand();
                	while(z3[num2][num1] == 0){
                        	z3[num2][num1]=rand();
                	}
		}
        }
	*/

        for(num1=0; num1<4; num1++){
                srand (time(NULL));
                z0[num1]=rand()+1;
                z1[num1]=rand()-1;
                //z1[num1]=rand()+1;
                //z0[num1]=rand()-1;
        }


	gettimeofday(&starttime, NULL);
        for(num1=0; num1<thread_count; num1++){
		//ft[num1].thread_id=num1;
                if (pthread_create(&threads[num1], NULL, FloatWork, NULL) != 0)
                        perror("pthread_create"), exit(1);
        }
        for(num1=0; num1<thread_count; num1++){
                if (pthread_join(threads[num1], NULL) != 0)
                        perror("pthread_join"),exit(1);
        }
        gettimeofday(&endtime, NULL);
        te0=((double)(endtime.tv_sec*1000000-starttime.tv_sec*1000000+endtime.tv_usec-starttime.tv_usec))/1000000;
	te0 = (float)6*thread_count/te0;
	printf("\t\tGFLOPS(estimated): %lf\n\n", te0);



return 0;
}

/*
 * Floating-point benchmark thread body.
 *
 * Loads the four-float operand vectors z0 and z1 and runs 125,000,000
 * iterations of six packed-single add/multiply pairs.  That is 12 SIMD
 * operations on 4 lanes per iteration, i.e. 6e9 floating-point operations
 * per call, which is the constant main() uses for its GFLOPS estimate.
 *
 * The kernel is written with SSE intrinsics rather than inline assembly so
 * that it is compiled on every x86 toolchain: the earlier assembly was
 * selected with "#elseif", which is not a preprocessor directive, so unless
 * OSX was defined the whole body was skipped and nothing was measured.
 *
 * null: unused.
 * Does not return; terminates the calling thread with a NULL exit value.
 */
void *FloatWork(void *null)
{
	/* operand vectors defined at file scope and seeded by main() */
	extern float z0[4];
	extern float z1[4];

	__m128 x = _mm_loadu_ps(z0);
	__m128 y = _mm_loadu_ps(z1);
	float out[4];
	volatile float sink;
	int iter;

	(void)null;

	for(iter=0; iter<125000000; iter++){
		/* same dependent add/mul chain as: addps xmm0,xmm1 ; mulps xmm1,xmm0 */
		x = _mm_add_ps(x, y);
		y = _mm_mul_ps(y, x);
		x = _mm_add_ps(x, y);
		y = _mm_mul_ps(y, x);
		x = _mm_add_ps(x, y);
		y = _mm_mul_ps(y, x);
		x = _mm_add_ps(x, y);
		y = _mm_mul_ps(y, x);
		x = _mm_add_ps(x, y);
		y = _mm_mul_ps(y, x);
		x = _mm_add_ps(x, y);
		y = _mm_mul_ps(y, x);
	}

	/* Consume the result through a volatile so the optimizer cannot discard the loop. */
	_mm_storeu_ps(out, _mm_add_ps(x, y));
	sink = out[0] + out[1] + out[2] + out[3];
	(void)sink;

	pthread_exit(0);
}


/*
void *FloatWork4(void *null)
{
	__m128 a;
	__m128 b;
        int w=0;
	float z0, z1, z2, z3, z4, z5, z6, z7;
	
        //float *z3;
	//z3 = (float *)malloc( 8*sizeof(float) );
	//srand (time(NULL));

        //for(w=0; w<8; w++){
                srand ( time(NULL) );
                z0=rand()+1;
		srand ( z0 );
		z1=rand()+1;
		srand ( z1 );
		z2=rand()+1;
		srand ( z2 );
		z3=rand()+1;
		srand ( z3 );
		z4=rand()+1;
		srand ( z4 );
		z5=rand()+1;
		srand ( z5 );
		z6=rand()+1;
		srand ( z6 );
		z7=rand()+1;


                //while(z3[b] == 0){
                //        z3[b]=rand();
                //}
                        
        //}

	//printf("here !\n");

	//ft[index].z3[num1]
	//int index = *(int *)f_arg;

	//for(int w=0; w<125000000; w++){


	a = _mm_setr_ps(z0, z1, z2, z3);
	b = _mm_setr_ps(z4, z5, z6, z7);

	while(w<125000000){
		a = _mm_add_ps(a, b);
		b = _mm_mul_ps(a, b);
                a = _mm_add_ps(a, b);   
                b = _mm_mul_ps(a, b);  
                a = _mm_add_ps(a, b);   
                b = _mm_mul_ps(a, b);  
                a = _mm_add_ps(a, b);   
                b = _mm_mul_ps(a, b);  
                a = _mm_add_ps(a, b);   
                b = _mm_mul_ps(a, b);  
                a = _mm_add_ps(a, b);   
                b = _mm_mul_ps(a, b);  
	

		w++;
		
                        z0=z1+z2*z3+z7*z6+z5*z4;
                        z1=z2+z3*z0+z6*z5+z4*z7;
                        z2=z3+z0*z1+z5*z4+z7*z6;
                        z3=z0+z1*z2+z4*z7+z6*z5;
                        z0=z1+z2*z3+z7*z6+z5*z4;
                        z1=z2+z3*z0+z6*z5+z4*z7;
                        z2=z3+z0*z1+z5*z4+z7*z6;
                        z3=z0+z1*z2+z4*z7+z6*z5;
			w++;
		
		ft[index].z3[0]=ft[index].z3[1]+ft[index].z3[2]*ft[index].z3[3]+ft[index].z3[7]*ft[index].z3[6]+ft[index].z3[5]*ft[index].z3[4];
		
		ft[index].z3[1]=ft[index].z3[2]+ft[index].z3[3]*ft[index].z3[0]+ft[index].z3[6]*ft[index].z3[5]+ft[index].z3[4]*ft[index].z37];
		
		ft[index].z3[2]=ft[index].z3[3]+ft[index].z3[0]*ft[index].z3[1]+ft[index].z3[5]*ft[index].z3[4]+ft[index].z3[7]*ft[index].z3[6];
		
		ft[index].z3[3]=ft[index].z3[0]+ft[index].z3[1]*ft[index].z3[2]+ft[index].z3[4]*ft[index].z3[7]+ft[index].z3[6]*ft[index].z3[5];
		
		ft[index].z3[0]=ft[index].z3[1]+ft[index].z3[2]*ft[index].z3[3]+ft[index].z3[7]*ft[index].z3[6]+ft[index].z3[5]*ft[index].z3[4];
		
		ft[index].z3[1]=ft[index].z3[2]+ft[index].z3[3]*ft[index].z3[0]+ft[index].z3[6]*ft[index].z3[5]+ft[index].z3[4]*ft[index].z3[7];
		
		ft[index].z3[2]=ft[index].z3[3]+ft[index].z3[0]*ft[index].z3[1]+ft[index].z3[5]*ft[index].z3[4]+ft[index].z3[7]*ft[index].z3[6];
		ft[index].z3[3]=ft[index].z3[0]+ft[index].z3[1]*ft[index].z3[2]+ft[index].z3[4]*ft[index].z3[7]+ft[index].z3[6]*ft[index].z3[5];
		
	}
printf("W: %d\n", w);
        pthread_exit(0);
}
*/



/*
 * Memory benchmark: memcpy.
 *
 * mem_arg points at an int used as this thread's index into mem_table.
 * Copies block_size bytes from the entry's a slice to its b slice and back
 * again, three round trips in total (six memcpy calls).
 * Does not return; terminates the calling thread.
 */
void *MemWork1(void *mem_arg)
{
	const int slot = *(int *)mem_arg;
	int round_trip;

	for(round_trip=0; round_trip<3; round_trip++){
		(void)memcpy( mem_table[slot].b, mem_table[slot].a, mem_table[slot].block_size);
		(void)memcpy( mem_table[slot].a, mem_table[slot].b, mem_table[slot].block_size);
	}

	pthread_exit(0);
}

/*
 * Memory benchmark: memset.
 *
 * mem_arg points at an int used as this thread's index into mem_table.
 * Fills the entry's b slice and then its a slice with a byte value, six
 * times over with a different pair of values each round (twelve memset
 * calls of block_size bytes each).
 * Does not return; terminates the calling thread.
 */
void *MemWork2(void *mem_arg)
{
	/* {byte for b, byte for a} per round, in the order they are applied */
	static const int fill[6][2] = {
		{124, 122},
		{ 12,  18},
		{ 24,  22},
		{  2,   4},
		{ 36,  48},
		{ 96,  87}
	};
	const int slot = *(int *)mem_arg;
	int round;

	for(round=0; round<6; round++){
		(void)memset (mem_table[slot].b, fill[round][0], mem_table[slot].block_size);
		(void)memset (mem_table[slot].a, fill[round][1], mem_table[slot].block_size);
	}

	pthread_exit(0);
}

/*
 * Memory benchmark: memmove.
 *
 * mem_arg points at an int used as this thread's index into mem_table.
 * Moves block_size bytes from the entry's a slice to its b slice and back
 * again, three round trips in total (six memmove calls).
 * Does not return; terminates the calling thread.
 */
void *MemWork3(void *mem_arg)
{
	const int slot = *(int *)mem_arg;
	int round_trip;

	for(round_trip=0; round_trip<3; round_trip++){
		(void)memmove( mem_table[slot].b, mem_table[slot].a, mem_table[slot].block_size);
		(void)memmove( mem_table[slot].a, mem_table[slot].b, mem_table[slot].block_size);
	}

	pthread_exit(0);
}

/*
 * Memory benchmark: word-by-word read loop.
 *
 * mem_arg points at an int used as this thread's index into mem_table.
 * Reads every long in the entry's a slice and b slice, six passes over the
 * whole slice, so each call reads 12 * block_size bytes.
 *
 * Each pass restarts at the beginning of the slice; the loads go through
 * volatile-qualified pointers so the compiler has to issue every one of
 * them even though the values are discarded.
 * Does not return; terminates the calling thread.
 */
void *MemWork4(void *mem_arg)
{
	int index=*(int *)mem_arg;
	const volatile long *src_a = mem_table[index].a;
	const volatile long *src_b = mem_table[index].b;
	/* long, not int: block_size can exceed INT_MAX for large slices */
	long words = mem_table[index].block_size / (long)sizeof(long);
	long sink;
	long i;
	int pass;

	for(pass=0; pass<6; pass++){
		for(i=0; i<words; i++){
			sink = src_a[i];
			sink = src_b[i];
			(void)sink;
		}
	}
	pthread_exit(0);
}


/*
 * Memory benchmark: word-by-word write loop.
 *
 * mem_arg points at an int used as this thread's index into mem_table.
 * Stores a fixed bit pattern into every long of the entry's a slice and
 * b slice, six passes over the whole slice, so each call writes
 * 12 * block_size bytes.
 *
 * The stores target the slice contents (not the local pointer variables),
 * each pass restarts at the beginning of the slice, and the destination
 * pointers are volatile-qualified so no store can be merged or dropped.
 * Does not return; terminates the calling thread.
 */
void *MemWork5(void *mem_arg)
{
        static const char c1[]={22, 44, 11, 25, 36, 44, 69, 71};

        int index=*(int *)mem_arg;
        volatile long *dst_a = mem_table[index].a;
        volatile long *dst_b = mem_table[index].b;
        /* long, not int: block_size can exceed INT_MAX for large slices */
        long words = mem_table[index].block_size / (long)sizeof(long);
        long pattern = 0;
        long i;
        int pass;

        /* build the word to store from the byte pattern, whatever sizeof(long) is */
        (void)memcpy( &pattern, c1, sizeof pattern < sizeof c1 ? sizeof pattern : sizeof c1);

        for(pass=0; pass<6; pass++){
                for(i=0; i<words; i++){
                        dst_a[i] = pattern;
                        dst_b[i] = pattern;
                }
        }
        pthread_exit(0);
}

