/*
    lsbench is a benchmark project.
    Copyright (C) 2007-2010  sterling pickens

    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program.  If not, see <http://www.gnu.org/licenses/>.
*/

#include "lsbench.h"

#include <stdio.h>
#include <string.h>

/* Wall-clock samples taken with gettimeofday() immediately before and after
   each timed benchmark phase in lsbench(). */
struct timeval starttime,endtime;
/* The two float buffers the memory benchmarks copy between and overwrite.
   lsbench() allocates and fills them, and frees them once the memory
   benchmarks are done. */
float *a;
float *b;
/* Four-float operands that FloatWork() loads into xmm0 (z0) and xmm1 (z1).
   lsbench() fills them from rand() before each floating point run. */
float z0[4];
float z1[4];

/* Per-thread work description for the memory benchmarks.  mem_table is an
   array with one entry per thread; lsbench() allocates, fills and frees it. */
struct mem_chunk{
	/* Index of this entry in mem_table.  lsbench() passes the address of this
	   field as the thread argument and the MemWork* functions read it back as
	   an int to find their entry. */
	int thread_id;
	/* Number of bytes each memcpy/memset/memmove call in a worker covers. */
	long block_size;
	/* Start of this thread's slice of the global buffer a. */
	float *a;
	/* Start of this thread's slice of the global buffer b. */
	float *b;
}*mem_table;

/* Thread entry points handed to pthread_create() by lsbench(). */
/* FloatWork does not use its argument. */
void *FloatWork(void *null);
/* The MemWork* functions expect mem_arg to point at an int that indexes
   mem_table. */
void *MemWork1(void *mem_arg);
void *MemWork2(void *mem_arg);
void *MemWork3(void *mem_arg);

void lsbench(int thread_count, int verbose, size_t memtotal, double *results){
	//int thread_count;
	//size_t memtotal;
	size_t memportion;
	size_t num1;
	int c;
	int float_size;
	long asize;
	long block_size;
	//struct timeval starttime,endtime;
	double te0;

	/*
        if(argc != 3){
                usage();
                exit(1);
        }else{
                for(c=0; c<argc; c++){
                        if( !strcmp(argv[c], "-h") || !strcmp(argv[c], "-H") ){
                                usage();
                                exit(0);
                        }
                } 
                thread_count = atoi(argv[1]);
                memtotal = atoi(argv[2]);
                if(memtotal == 0 || thread_count == 0){
                        usage();
                        exit(1);
                }
     */
	if(verbose){
		printf("\nlsbench test version 03/04/2010 linuxsociety.org\n");
		printf("\tRunning %d threads\n", thread_count);
		printf("\tUsing %luMB of mem\n\n", memtotal);
	}

	pthread_t *threads = (pthread_t *)malloc(sizeof(pthread_t) * thread_count);
	memportion = (int)(memtotal / 2 / thread_count );

	mem_table = (struct mem_chunk *)malloc( sizeof(struct mem_chunk) * thread_count);
	float_size=sizeof(float);
	asize = (long)(memtotal / 2 * 1024 * 1024 / float_size);
	block_size = (long)(memportion * 1024 * 1024);

	a = malloc(sizeof(float)*asize);
	b = malloc(sizeof(float)*asize);

	//printf("sizeof a: %u\n", (unsigned int)(sizeof(float)*asize) );

	//put some random stuff in them
	srand ( time(NULL) );
	for(num1=0; num1<asize; num1++){
		a[num1] = (float)rand();
		b[num1] = (float)rand();
	}
	asize = (long)(memportion * 1024 * 1024 / float_size);
	for(num1=0; num1<thread_count; num1++){	
			//mem_table[num1].asize = asize;
			mem_table[num1].thread_id = num1;
			mem_table[num1].block_size = block_size;
			mem_table[num1].a = &a[num1*asize];
			mem_table[num1].b = &b[num1*asize];
	}


	gettimeofday(&starttime, NULL);
		for(num1=0; num1<thread_count; num1++){
			//mem_table.thread_id[num1]=num1;
			if (pthread_create(&threads[num1], NULL, MemWork1, &mem_table[num1].thread_id) != 0)
				perror("pthread_create"), exit(1);
		}

		for(num1=0; num1<thread_count; num1++){
			if (pthread_join(threads[num1], NULL) != 0)
				perror("pthread_join"),exit(1);
		}
	gettimeofday(&endtime, NULL);

	te0=((double)(endtime.tv_sec*1000000-starttime.tv_sec*1000000+endtime.tv_usec-starttime.tv_usec))/1000000;
	if(verbose){
		printf("Running Memory Benchmarks\n");
		// 6 times read+write
		printf("\tmemcpy  (read+write)        :");
		printf("\t%.03lfMB/s\n", (double)memtotal*6/te0);
	}
	results[0] += (double)memtotal*6/te0;

	gettimeofday(&starttime, NULL);
	for(num1=0; num1<thread_count; num1++){
		if (pthread_create(&threads[num1], NULL, MemWork2, &mem_table[num1].thread_id) != 0)
			perror("pthread_create"), exit(1);
	}
	for(num1=0; num1<thread_count; num1++){
		if (pthread_join(threads[num1], NULL) != 0)
			perror("pthread_join"),exit(1);
	}
	gettimeofday(&endtime, NULL);
	te0=((double)(endtime.tv_sec*1000000-starttime.tv_sec*1000000+endtime.tv_usec-starttime.tv_usec))/1000000;
	if(verbose){
		printf("\tmemset       (write)        :");	
		printf("\t%.03lfMB/s\n", (double)memtotal*6/te0);
	}
	results[1] += memtotal*6/te0;

	gettimeofday(&starttime, NULL);
	for(num1=0; num1<thread_count; num1++){
		if (pthread_create(&threads[num1], NULL, MemWork3, &mem_table[num1].thread_id) != 0)
			perror("pthread_create"), exit(1);
	}
	for(num1=0; num1<thread_count; num1++){
		if (pthread_join(threads[num1], NULL) != 0)
			perror("pthread_join"),exit(1);
	}
	gettimeofday(&endtime, NULL);
	te0=((double)(endtime.tv_sec*1000000-starttime.tv_sec*1000000+endtime.tv_usec-starttime.tv_usec))/1000000;
	if(verbose){
		printf("\tmemmove (read+write)        :");
		printf("\t%.03lfMB/s\n", (double)memtotal*6/te0);
	}
	results[2] += memtotal*6/te0;

	free(a);
	free(b);
	if(verbose){
		printf("\nRunning Floating Point Benchmarks\n");
		printf("\tSingle Threaded Performance :");
	}

	for(num1=0; num1<4; num1++){
		srand (time(NULL));
		z0[num1]=rand()+1;
		z1[num1]=rand()-1;
	}

	gettimeofday(&starttime, NULL);

	if(pthread_create(&threads[0], NULL, FloatWork, NULL) != 0)
		perror("pthread_create"), exit(1);
	if(pthread_join(threads[0], NULL) != 0)
		perror("pthread_join"),exit(1);

	gettimeofday(&endtime, NULL);
	te0=((double)(endtime.tv_sec*1000000-starttime.tv_sec*1000000+endtime.tv_usec-starttime.tv_usec))/1000000;
	te0 = 64/te0;

	if(verbose){
		printf("\t%.03lfGFLOPS\n", te0);
		printf("\tMulti-Threaded Performance  :");
	}
	results[3] += te0;

	for(num1=0; num1<4; num1++){
		srand (time(NULL));
		z0[num1]=rand()+1;
		z1[num1]=rand()-1;
	}

	gettimeofday(&starttime, NULL);
        for(num1=0; num1<thread_count; num1++){
                if(pthread_create(&threads[num1], NULL, FloatWork, NULL) != 0)
                        perror("pthread_create"), exit(1);
        }
        for(num1=0; num1<thread_count; num1++){
                if(pthread_join(threads[num1], NULL) != 0)
                        perror("pthread_join"),exit(1);
        }
        gettimeofday(&endtime, NULL);
        te0=((double)(endtime.tv_sec*1000000-starttime.tv_sec*1000000+endtime.tv_usec-starttime.tv_usec))/1000000;
	te0 = (float)64*thread_count/te0;
	if(verbose)
		printf("\t%.03lfGFLOPS\n\n", te0);
	results[4] += te0;

	free(mem_table);
	//free(a);
	//free(b);
	free(threads);
	//printf("here 1\n");
	//return 0;
}

/*
 * Floating point worker thread: runs a fixed-length SSE add/multiply loop.
 *
 * The argument is not used.  z0 and z1 are loaded into xmm0 and xmm1 and
 * copied into xmm2-xmm7.  ecx is then loaded with 4000000000 (written as
 * 0xee6b2800 in the AT&T branch) and decremented until it reaches zero; each
 * pass executes two addps and two mulps.  The thread finishes through
 * pthread_exit(0) rather than a return statement.
 *
 * The same loop is written three times and the preprocessor selects one:
 *   OSX    - an asm{ } block
 *   INTEL  - one asm() statement per instruction after ".intel_syntax noprefix"
 *   else   - a single AT&T-syntax __asm__ statement
 *
 * NOTE(review): none of the asm statements declares operands or clobbers,
 * although ecx and xmm0-xmm7 are written - presumably this relies on the
 * compiler keeping nothing live in those registers; confirm.
 * NOTE(review): the INTEL branch never switches the assembler back from
 * .intel_syntax - confirm the build is set up for that.
 */
void *FloatWork(void *null)
{

#ifdef OSX
        asm{
                movups xmm0 z0
                movups xmm1 z1
		movups xmm2 xmm1
		movups xmm3 xmm0
		movups xmm4 xmm1
		movups xmm5 xmm0
		movups xmm6 xmm1
		movups xmm7 xmm0
                mov ecx 4000000000
                LOOP1:
                	addps xmm0 xmm4
                	mulps xmm1 xmm5
                	addps xmm2 xmm6
			mulps xmm3 xmm7
		dec ecx
		jnz LOOP1
        }
#elif INTEL
		/* Intel operand order: destination first. */
		asm(".intel_syntax noprefix\n");
			asm("movups xmm0, [z0]\n");
			asm("movups xmm1, [z1]\n");
			asm("movups xmm2, xmm1\n");
			asm("movups xmm3, xmm0\n");
			asm("movups xmm4, xmm1\n");
			asm("movups xmm5, xmm0\n");
			asm("movups xmm6, xmm1\n");
			asm("movups xmm7, xmm0\n");
			asm("mov ecx, 4000000000\n");
			asm("LOOP1:\n");
				asm("addps xmm0, xmm4\n");
				asm("mulps xmm1, xmm5\n");
                		asm("addps xmm2, xmm6\n");
                		asm("mulps xmm3, xmm7\n");
      			asm("dec ecx\n");
			asm("jnz LOOP1\n");
#else
		/* AT&T operand order: source first, destination last. */
		__asm__(
			"movups (z0), %xmm0\n"
			"movups (z1), %xmm1\n"
			"movups %xmm1, %xmm2\n"
			"movups %xmm0, %xmm3\n"
			"movups %xmm1, %xmm4\n"
			"movups %xmm0, %xmm5\n"
			"movups %xmm1, %xmm6\n"
			"movups %xmm0, %xmm7\n"
			"movl $0xee6b2800, %ecx\n"
			"LOOP1:\n"
				"addps %xmm4, %xmm0\n"
				"mulps %xmm5, %xmm1\n"
				"addps %xmm6, %xmm2\n"
				"mulps %xmm7, %xmm3\n"
			"dec %ecx\n"
			"jnz LOOP1\n"
		);
#endif
	pthread_exit(0);
}

/*
 * memcpy worker thread.
 *
 * mem_arg points at an int selecting this thread's entry in mem_table.  The
 * entry's two buffers are copied back and forth (a to b, then b to a) three
 * times, block_size bytes per copy: six memcpy calls in total.  The thread
 * ends through pthread_exit(0).
 */
void *MemWork1(void *mem_arg)
{
	const int slot = *(int *)mem_arg;
	float *const buf_a = mem_table[slot].a;
	float *const buf_b = mem_table[slot].b;
	const long nbytes = mem_table[slot].block_size;
	int round;

	for (round = 0; round < 3; round++) {
		(void)memcpy(buf_b, buf_a, nbytes);
		(void)memcpy(buf_a, buf_b, nbytes);
	}

	pthread_exit(0);
}

/*
 * memset worker thread.
 *
 * mem_arg points at an int selecting this thread's entry in mem_table.  The
 * entry's buffers are overwritten alternately, b first and then a, with a
 * fixed sequence of byte values: twelve memset calls of block_size bytes
 * each.  The thread ends through pthread_exit(0).
 */
void *MemWork2(void *mem_arg)
{
	/* { value written to b, value written to a } in the order they are applied */
	static const int fill[][2] = {
		{ 124, 122 },
		{  12,  18 },
		{  24,  22 },
		{   2,   4 },
		{  36,  48 },
		{  96,  87 },
	};
	const int slot = *(int *)mem_arg;
	float *const buf_a = mem_table[slot].a;
	float *const buf_b = mem_table[slot].b;
	const long nbytes = mem_table[slot].block_size;
	size_t pass;

	for (pass = 0; pass < sizeof fill / sizeof fill[0]; pass++) {
		(void)memset(buf_b, fill[pass][0], nbytes);
		(void)memset(buf_a, fill[pass][1], nbytes);
	}

	pthread_exit(0);
}

/*
 * memmove worker thread.
 *
 * mem_arg points at an int selecting this thread's entry in mem_table.  The
 * entry's two buffers are moved back and forth (a to b, then b to a) three
 * times, block_size bytes per move: six memmove calls in total.  The thread
 * ends through pthread_exit(0).
 */
void *MemWork3(void *mem_arg)
{
	const int slot = *(int *)mem_arg;
	float *const buf_a = mem_table[slot].a;
	float *const buf_b = mem_table[slot].b;
	const long nbytes = mem_table[slot].block_size;
	int round;

	for (round = 0; round < 3; round++) {
		(void)memmove(buf_b, buf_a, nbytes);
		(void)memmove(buf_a, buf_b, nbytes);
	}

	pthread_exit(0);
}
