#include <stdlib.h>
#include <sys/time.h>
#include <stdio.h>
#include <string.h>

#include <pthread.h>
#include "bandwidth.h"

/* Number of benchmark threads requested on the command line (argv[1]). */
int cores;
/* NOTE(review): not referenced by any live code in this file. */
int set_size;

/* Wall-clock stamps taken with gettimeofday() around each timed section. */
struct timeval starttime,endtime;
/* Holds the elapsed seconds of a timed section and is then overwritten
 * with the GFLOPS figure that main() prints. */
double te0;
/* General-purpose scratch integers. They are not static, so other
 * translation units may reference them.
 * NOTE(review): they are shared by every thread, which makes them unsafe
 * as loop indices inside thread start routines. */
int i, b, a, a3, t, s;
int counter1, counter2, counter3;

//pthread_mutex_t    mutex = PTHREAD_MUTEX_INITIALIZER;
/* Heap-allocated array of `cores` mutexes; main() sets it up only when
 * more than one thread is requested, otherwise it stays NULL. */
pthread_mutex_t    *mutex;
//pthread_mutex_t    mutex1;
//pthread_mutex_t    mutex2;
//pthread_mutex_t    mutex3;


/* Thread start routine handed to pthread_create() by main(); the argument
 * is unused. Defined further down in this file. */
void *FloatWork(void *null);

/* Print the command-line synopsis to standard output. */
void usage(void){
	/* One write of the concatenated literal; the bytes emitted are the
	 * same as printing the three lines separately. */
	fputs("Usage: bench [threads] [max mem MB]\n"
	      "        -h: this help menu\n"
	      "\n", stdout);
}


/*
 * Wall-clock time from *start to *end, in seconds.
 *
 * The seconds and microseconds fields are subtracted separately before
 * scaling, so the computation cannot overflow even where tv_sec is a
 * 32-bit type.
 */
static double bench_elapsed_sec(const struct timeval *start, const struct timeval *end)
{
	return (double)(end->tv_sec - start->tv_sec)
	     + (double)(end->tv_usec - start->tv_usec) / 1000000.0;
}

/*
 * Benchmark driver.
 *
 *   argv[1]  number of threads (must be >= 1)
 *   argv[2]  memory size in MB, stored in memtotal (must be non-zero)
 *
 * Runs membandwidth(), then a single-threaded floating-point kernel, and,
 * when more than one thread was requested, the same kernel in `cores`
 * threads via FloatWork(). Prints a GFLOPS figure for each phase.
 *
 * Returns 0 on success; exits with status 1 on bad arguments or when a
 * required resource cannot be set up.
 */
int main(int argc, char **argv) {
	int c;
	int rc = 0;
	int n;			/* index over mutexes / threads */
	int started = 0;	/* threads actually created */
	/* Local loop indices deliberately shadow the file-scope a/b so the
	 * timed kernel does not run through shared globals (FloatWork() uses
	 * locals for the same loops). */
	int a, b;
	float z3[128];
	pthread_t *threads;
	pthread_attr_t attr3;

	/* Honour -h/-H no matter how many arguments were supplied. */
	for(c=1; c<argc; c++){
		if( !strcmp(argv[c], "-h") || !strcmp(argv[c], "-H") ){
			usage();
			exit(0);
		}
	}

	if(argc != 3){
		usage();
		exit(1);
	}

	cores = atoi(argv[1]);
	memtotal = atof(argv[2]);
	/* A negative thread count is as unusable as zero: it would later be
	 * used as an allocation size and loop bound. */
	if(memtotal == 0 || cores <= 0){
		usage();
		exit(1);
	}

	printf("\nRunning %d threads\n", cores);
	printf("Using %.02lfMB of mem\n", memtotal);

	if(cores != 1)
	{
		mutex = malloc((size_t)cores * sizeof *mutex);
		if(mutex == NULL){
			perror("malloc");
			exit(1);
		}
		/* Initialise every slot, including index 0, because all of them
		 * are destroyed at the end. A NULL attribute selects the default
		 * mutex attributes, the same as a freshly initialised attribute
		 * object would. */
		for(n = 0; n < cores; n++){
			rc = pthread_mutex_init(&mutex[n], NULL);
			if(rc != 0){
				fprintf(stderr, "pthread_mutex_init: %s\n", strerror(rc));
				exit(1);
			}
		}
	}
	membandwidth();

	printf("Running Floating Point Benchmarks\n\n");

	printf("\tSingle Threaded Performance:\n");

	/* Seed each element with a non-zero pseudo-random value. */
	for(b=0; b<128; b++){
		srand ( b );
		z3[b]=rand();
		while(z3[b] == 0){
			z3[b]=rand();
		}
	}

	gettimeofday(&starttime, NULL);
	/* 610351 passes x 128 elements x 64 operators per expression
	 * comes to roughly 5 billion floating-point operations. */
	for(a=0; a<610351; a++)
	{
		for(b=0; b<128; b++)
		{
			z3[b]=z3[b]+z3[b]-z3[b]+z3[b]*z3[b]
			+z3[b]-z3[b]+z3[b]*z3[b]
			*z3[b]-z3[b]*z3[b]*z3[b]
			*z3[b]-z3[b]*z3[b]*z3[b]
			*z3[b]-z3[b]+z3[b]+z3[b]
			*z3[b]-z3[b]+z3[b]+z3[b]
			+z3[b]-z3[b]*z3[b]+z3[b]
			+z3[b]-z3[b]*z3[b]+z3[b]
			*z3[b]-z3[b]+z3[b]-z3[b]
			*z3[b]-z3[b]+z3[b]-z3[b]
			+z3[b]-z3[b]*z3[b]-z3[b]
			+z3[b]-z3[b]*z3[b]-z3[b]
			+z3[b]-z3[b]+z3[b]-z3[b]
			+z3[b]-z3[b]+z3[b]-z3[b]
			*z3[b]-z3[b]*z3[b]-z3[b]
			*z3[b]-z3[b]*z3[b]-z3[b];
		}
	}
	gettimeofday(&endtime, NULL);
	te0 = bench_elapsed_sec(&starttime, &endtime);
	/* NOTE(review): ~5e9 operations would give 5/te0 GFLOPS; the constant
	 * 50 is kept unchanged here - confirm the intended scaling. */
	te0 = 50 /te0;
	printf("\t\tGFLOPS: %lf\n\n", te0);

	if(cores == 1){
		printf("Done\n");
		return 0;
	}

	printf("\tMulti-Threaded Performance:\n");

	threads = malloc((size_t)cores * sizeof *threads);
	if(threads == NULL){
		perror("malloc");
		exit(1);
	}

	pthread_attr_init(&attr3);
	pthread_attr_setdetachstate(&attr3, PTHREAD_CREATE_JOINABLE);

	gettimeofday(&starttime, NULL);
	for(n=0; n<cores; n++)
	{
		rc = pthread_create(&threads[started], &attr3, FloatWork, NULL);
		if(rc != 0){
			/* Stop launching; only threads that really exist are
			 * joined and counted below. */
			fprintf(stderr, "pthread_create: %s\n", strerror(rc));
			break;
		}
		started++;
	}
	pthread_attr_destroy(&attr3);
	for(n=0; n<started; n++)
	{
		pthread_join(threads[n], NULL);
	}
	gettimeofday(&endtime, NULL);

	te0 = bench_elapsed_sec(&starttime, &endtime);
	/* Equals 50*cores/te0 whenever every requested thread was created. */
	te0 = 50*started /te0;
	printf("\t\tGFLOPS: %lf\n\n", te0);
	printf("\tdone.\n\n");

	for(n=0; n < cores; n++){
		pthread_mutex_destroy(&mutex[n]);
	}
	free(mutex);
	mutex = NULL;
	free(threads);

	printf("Done\n");
	return 0;
}

/*
 * Thread start routine for the multi-threaded floating-point benchmark.
 *
 * Fills a private 128-element float array with pseudo-random values and
 * then runs 610351 passes over it, applying a 64-operator arithmetic
 * expression to each element. The result is discarded; only the time
 * spent matters to the caller.
 *
 * The argument is unused. The routine ends the calling thread through
 * pthread_exit() with a null exit value and does not return.
 */
void *FloatWork(void *null)
{

	int w, y;
	float z2[128];

	(void)null;

	/* Use a thread-local index here. Several threads run this routine at
	 * once, so a shared file-scope counter could be advanced by another
	 * thread between the bound check and the store, writing past the end
	 * of z2. */
	/* NOTE(review): srand()/rand() share hidden state across threads and
	 * are not required to be thread-safe; the exact seed values are
	 * irrelevant to the benchmark. */
        for(y=0; y<128; y++){
                srand ( y );
                z2[y]=rand();
        }

        for(w=0; w<610351; w++)
        {
                for(y=0; y<128; y++)
                {

                        z2[y]=z2[y]+z2[y]-z2[y]+z2[y]*z2[y]
			+z2[y]-z2[y]+z2[y]*z2[y]
			*z2[y]-z2[y]*z2[y]*z2[y]
			*z2[y]-z2[y]*z2[y]*z2[y]
			*z2[y]-z2[y]+z2[y]+z2[y]
			*z2[y]-z2[y]+z2[y]+z2[y]
			+z2[y]-z2[y]*z2[y]+z2[y]
			+z2[y]-z2[y]*z2[y]+z2[y]
			*z2[y]-z2[y]+z2[y]-z2[y]
			*z2[y]-z2[y]+z2[y]-z2[y]
			+z2[y]-z2[y]*z2[y]-z2[y]
			+z2[y]-z2[y]*z2[y]-z2[y]
			+z2[y]-z2[y]+z2[y]-z2[y]
			+z2[y]-z2[y]+z2[y]-z2[y]
			*z2[y]-z2[y]*z2[y]-z2[y]
			*z2[y]-z2[y]*z2[y]-z2[y];
			
                }
        }

   pthread_exit((void *) 0);
}
