#include <stdlib.h>
#include <sys/time.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <pthread.h>
#include <math.h>
//#include "bandwidth.h"

// Per-thread result slots: main() allocates one double per worker thread.
// In the visible code nothing reads or writes the slots (only commented-out
// code refers to them).
double *mem_vars;
// Share of the requested memory size (MB) assigned to each worker thread;
// main() sets it to memtotal/cores and MemWork() reads it.
double mem_portion;

//pthread_mutex_t    mutex = PTHREAD_MUTEX_INITIALIZER;
// Array of mutexes allocated and initialised by main() (one per thread,
// only when more than one thread is requested). No visible code locks them.
pthread_mutex_t    *mutex;
//pthread_mutex_t    mutex1;
//pthread_mutex_t    mutex2;
//pthread_mutex_t    mutex3;

// gettimeofday() samples taken by main() around each timed benchmark phase.
struct timeval starttime,endtime;

// Thread entry points: memory-copy worker and floating-point worker.
void *MemWork(void *mem_arg);
void *FloatWork(void *null);

// Print the command-line synopsis and the -h option description to stdout,
// followed by a blank line.
void usage(void){
	fputs("Usage: bench [threads] [max mem MB]\n"
	      "        -h: this help menu\n"
	      "\n",
	      stdout);
}


/*
 * Seconds elapsed between two gettimeofday() samples.
 * Each field is subtracted separately and the parts are combined in
 * floating point, so no tv_sec * 1000000 product is ever formed in an
 * integer type (that product overflows a 32-bit long).
 */
static double bench_elapsed_seconds(const struct timeval *start,
                                    const struct timeval *end)
{
	return (double)(end->tv_sec - start->tv_sec)
	     + (double)(end->tv_usec - start->tv_usec) / 1000000.0;
}

/*
 * Start `count` threads running `work` with attributes `attr`, then join
 * every thread that was actually started.
 *
 * ids: optional array of `count` ints; thread t receives &ids[t] as its
 *      argument. When ids is NULL each thread receives NULL.
 *
 * Returns 0 when all threads were created and joined, -1 otherwise.
 */
static int bench_run_threads(pthread_t *threads, int *ids, int count,
                             const pthread_attr_t *attr,
                             void *(*work)(void *))
{
	int started = 0;
	int status = 0;
	int t;

	for (t = 0; t < count; t++) {
		void *arg = (ids != NULL) ? (void *)&ids[t] : NULL;

		if (pthread_create(&threads[t], attr, work, arg) != 0) {
			status = -1;
			break;
		}
		started++;
	}

	/* Only join handles that pthread_create() really filled in. */
	for (t = 0; t < started; t++) {
		if (pthread_join(threads[t], NULL) != 0) {
			status = -1;
		}
	}

	return status;
}

/*
 * Benchmark driver.
 *
 * Usage: bench [threads] [max mem MB]
 *
 * Runs, in order: a single-threaded floating point kernel, the same kernel
 * in `threads` concurrent FloatWork threads, and four rounds of `threads`
 * concurrent MemWork threads, printing one figure per phase.
 *
 * Returns 0 on success; exits with 1 on bad arguments and returns 1 when
 * an allocation or thread operation fails. "-h"/"-H" anywhere on the
 * command line prints the usage text and exits with 0.
 */
int main(int argc, char **argv) {
	enum {
		KERNEL_VALUES = 8,
		KERNEL_ITERATIONS = 125000000,
		MEM_ROUNDS = 4
	};

	double te0, memtotal;
	int cores;
	int a, b, c, t;
	int exit_code = 1;
	int attr_ready = 0;
	int mutexes_ready = 0;
	float *z3 = NULL;
	pthread_t *thread = NULL;
	int *thread_ids = NULL;
	pthread_attr_t attr3;

	/* Honour a help request regardless of how many arguments were given. */
	for (c = 1; c < argc; c++) {
		if (!strcmp(argv[c], "-h") || !strcmp(argv[c], "-H")) {
			usage();
			exit(0);
		}
	}

	if (argc != 3) {
		usage();
		exit(1);
	}

	cores = atoi(argv[1]);
	memtotal = atof(argv[2]);
	/* Written as !(x > 0) so that NaN is rejected as well. */
	if (!(memtotal > 0) || cores <= 0) {
		usage();
		exit(1);
	}

	printf("\nRunning %d threads\n", cores);
	printf("Using %.02lfMB of mem\n", memtotal);

	/* calloc() checks the count * size product for overflow. */
	z3 = calloc(KERNEL_VALUES, sizeof *z3);
	thread = calloc((size_t)cores, sizeof *thread);
	thread_ids = calloc((size_t)cores, sizeof *thread_ids);
	mem_vars = calloc((size_t)cores, sizeof *mem_vars);
	if (z3 == NULL || thread == NULL || thread_ids == NULL || mem_vars == NULL) {
		fprintf(stderr, "bench: out of memory\n");
		goto cleanup;
	}
	for (t = 0; t < cores; t++) {
		thread_ids[t] = t;
	}

	/* As before, the mutex array only exists for multi-threaded runs. */
	if (cores != 1) {
		mutex = calloc((size_t)cores, sizeof *mutex);
		if (mutex == NULL) {
			fprintf(stderr, "bench: out of memory\n");
			goto cleanup;
		}
		for (t = 0; t < cores; t++) {
			/* NULL selects the default mutex attributes. */
			if (pthread_mutex_init(&mutex[t], NULL) != 0) {
				fprintf(stderr, "bench: cannot initialise mutex %d\n", t);
				goto cleanup;
			}
			mutexes_ready++;
		}
	}

	/* One attribute object serves every pthread_create() call below and
	 * is destroyed exactly once, during cleanup. */
	if (pthread_attr_init(&attr3) != 0) {
		fprintf(stderr, "bench: cannot initialise thread attributes\n");
		goto cleanup;
	}
	attr_ready = 1;
	pthread_attr_setdetachstate(&attr3, PTHREAD_CREATE_JOINABLE);

	printf("Running Floating Point Benchmarks\n\n");

	printf("\tSingle Threaded Performance:\n");

	/* Seed the kernel operands with non-zero pseudo-random values. */
	for (b = 0; b < KERNEL_VALUES; b++) {
		srand(b);
		z3[b] = rand();
		while (z3[b] == 0) {
			z3[b] = rand();
		}
	}

	gettimeofday(&starttime, NULL);
	/* 8 statements x (3 multiplies + 3 adds) per iteration. */
	for (a = 0; a < KERNEL_ITERATIONS; a++) {
		z3[0]=z3[1]+z3[2]*z3[3]+z3[7]*z3[6]+z3[5]*z3[4];
		z3[1]=z3[2]+z3[3]*z3[0]+z3[6]*z3[5]+z3[4]*z3[7];
		z3[2]=z3[3]+z3[0]*z3[1]+z3[5]*z3[4]+z3[7]*z3[6];
		z3[3]=z3[0]+z3[1]*z3[2]+z3[4]*z3[7]+z3[6]*z3[5];
		z3[0]=z3[1]+z3[2]*z3[3]+z3[7]*z3[6]+z3[5]*z3[4];
		z3[1]=z3[2]+z3[3]*z3[0]+z3[6]*z3[5]+z3[4]*z3[7];
		z3[2]=z3[3]+z3[0]*z3[1]+z3[5]*z3[4]+z3[7]*z3[6];
		z3[3]=z3[0]+z3[1]*z3[2]+z3[4]*z3[7]+z3[6]*z3[5];
	}
	gettimeofday(&endtime, NULL);
	te0 = bench_elapsed_seconds(&starttime, &endtime);
	/* NOTE(review): the loop above performs 125000000 * 48 = 6e9 floating
	 * point operations, yet the figure is scaled by 60, not 6. The factor
	 * is kept unchanged so reported numbers stay comparable - confirm. */
	printf("\t\tGFLOPS: %lf\n\n", 60 / te0);

	printf("\tMulti-Threaded Performance:\n");

	gettimeofday(&starttime, NULL);
	if (bench_run_threads(thread, NULL, cores, &attr3, FloatWork) != 0) {
		fprintf(stderr, "bench: failed to run floating point threads\n");
		goto cleanup;
	}
	gettimeofday(&endtime, NULL);
	te0 = bench_elapsed_seconds(&starttime, &endtime);

	printf("\t\tGFLOPS: %lf\n\n", 60.0 * cores / te0);
	printf("\tdone.\n\n");

	mem_portion = memtotal/cores;

	gettimeofday(&starttime, NULL);
	for (b = 0; b < MEM_ROUNDS; b++) {
		/* Each thread gets the address of its own stable index instead of
		 * the address of a loop counter that keeps changing. */
		if (bench_run_threads(thread, thread_ids, cores, &attr3, MemWork) != 0) {
			fprintf(stderr, "bench: failed to run memory threads\n");
			goto cleanup;
		}
	}
	gettimeofday(&endtime, NULL);
	te0 = bench_elapsed_seconds(&starttime, &endtime);

	printf("\tMemory bandwidth\n");
	/* NOTE(review): memtotal is given in MB, so this value is in MB per
	 * second although the label says GB/s. Label kept as-is - confirm. */
	printf("\t\tGB/s: %lf\n\n", memtotal * MEM_ROUNDS / te0);

	exit_code = 0;

cleanup:
	if (attr_ready) {
		pthread_attr_destroy(&attr3);
	}
	/* mutex is NULL for single-threaded runs; destroy only what exists. */
	for (t = 0; t < mutexes_ready; t++) {
		pthread_mutex_destroy(&mutex[t]);
	}
	free(mutex);
	mutex = NULL;
	free(mem_vars);
	mem_vars = NULL;
	free(thread_ids);
	free(thread);
	free(z3);

	if (exit_code == 0) {
		printf("Done\n");
	}
	return exit_code;
}

/*
 * Thread entry point for the memory phase.
 *
 * Reads the file-scope mem_portion (MB per thread), allocates two
 * zero-filled buffers of floor(mem_portion/2) MB each, performs one
 * memcpy() between them and releases both buffers again.
 *
 * mem_arg: unused.
 * The thread always terminates with exit value 0, including when the
 * requested size is not positive or an allocation fails (the copy is then
 * skipped).
 */
void *MemWork(void *mem_arg)
{
	const size_t long_size = sizeof(long);
	long memtotal = floor(mem_portion/2);
	size_t asize;
	size_t block_size;
	long *a;
	long *b;

	(void)mem_arg;

	if (memtotal > 0) {
		/* Number of longs in memtotal MB. Dividing the 1 MiB constant
		 * first keeps the arithmetic out of signed long. */
		asize = (size_t)memtotal * (1024 * 1024 / long_size);

		a = calloc(asize, long_size);
		b = calloc(asize, long_size);

		if (a != NULL && b != NULL) {
			/* NOTE(review): block_size is an element count but
			 * memcpy() takes bytes, so only 1/sizeof(long) of each
			 * buffer is copied. Kept unchanged so the reported
			 * figure does not shift - confirm the intent. */
			block_size = asize;
			memcpy(b, a, block_size);
		}

		/* Release the buffers so repeated rounds do not accumulate
		 * several times the requested memory size. */
		free(a);
		free(b);
	}

	pthread_exit((void *) 0);
}



/*
 * Thread entry point for the multi-threaded floating point phase.
 *
 * Fills eight float operands with non-zero rand() values (re-seeding with
 * the slot index before each one), then applies eight multiply-add
 * statements to them 125000000 times.
 *
 * null: unused.
 * The thread terminates with exit value 0; the operands are discarded.
 */
void *FloatWork(void *null)
{
	float z3[8];
	long slot = 0;
	long pass = 0;

	(void)null;

	while (slot < 8) {
		srand ( slot );
		/* Draw again until the stored value is non-zero. */
		do {
			z3[slot] = rand();
		} while (z3[slot] == 0);
		slot++;
	}

	while (pass < 125000000) {
		z3[0] = z3[1] + z3[2]*z3[3] + z3[7]*z3[6] + z3[5]*z3[4];
		z3[1] = z3[2] + z3[3]*z3[0] + z3[6]*z3[5] + z3[4]*z3[7];
		z3[2] = z3[3] + z3[0]*z3[1] + z3[5]*z3[4] + z3[7]*z3[6];
		z3[3] = z3[0] + z3[1]*z3[2] + z3[4]*z3[7] + z3[6]*z3[5];
		z3[0] = z3[1] + z3[2]*z3[3] + z3[7]*z3[6] + z3[5]*z3[4];
		z3[1] = z3[2] + z3[3]*z3[0] + z3[6]*z3[5] + z3[4]*z3[7];
		z3[2] = z3[3] + z3[0]*z3[1] + z3[5]*z3[4] + z3[7]*z3[6];
		z3[3] = z3[0] + z3[1]*z3[2] + z3[4]*z3[7] + z3[6]*z3[5];
		pass++;
	}

	pthread_exit((void *) 0);
}
