/*
    lsbench is a benchmark project.
    Copyright (C) 2007-2010  sterling pickens

    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program.  If not, see <http://www.gnu.org/licenses/>.
*/

#include <errno.h>
#include <limits.h>
#include <math.h>
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/time.h>
#include <time.h>

#include "error.h"
#include "homedir.h"

/*
 * errno comes from <errno.h>, which is included above. It may be a macro
 * there, so it must not be declared by hand in this file.
 */

/* Wall-clock stamps taken immediately before and after each timed run. */
struct timeval starttime,endtime;

/*
 * Buffers used by the memory benchmarks. main() allocates them, fills them
 * with rand() values and hands each thread its own slice through mem_table.
 */
float *a;
float *b;

/*
 * Four-float operand vectors for the floating point benchmark. main() fills
 * them and FloatWork() refers to them by name from inline assembly, so their
 * names and external linkage must stay as they are.
 */
float z0[4];
float z1[4];

/*
 * Work descriptor handed to one memory-benchmark thread. main() allocates
 * one of these per thread and the MemWork* routines look their own entry
 * up by index.
 */
struct mem_chunk {
	int   thread_id;	/* index of this entry within mem_table */
	long  block_size;	/* byte count passed to each memcpy/memset/memmove */
	float *a;		/* start of this thread's slice of the global a buffer */
	float *b;		/* start of this thread's slice of the global b buffer */
};

/* Array of per-thread descriptors; allocated and filled in by main(). */
struct mem_chunk *mem_table;

/*
 * Thread start routines, defined after main(). Each one finishes by calling
 * pthread_exit(0).
 */
/* Floating point loop; the argument is not used. */
void *FloatWork(void *null);
/* memcpy passes; mem_arg points at an int index into mem_table. */
void *MemWork1(void *mem_arg);
/* memset passes; mem_arg points at an int index into mem_table. */
void *MemWork2(void *mem_arg);
/* memmove passes; mem_arg points at an int index into mem_table. */
void *MemWork3(void *mem_arg);

/* Write the command-line synopsis and the -h option to standard output. */
void usage(void){
	fputs("Usage: lsbench [threads] [max mem MB]\n"
	      "        -h: this help menu\n"
	      "\n",
	      stdout);
}

int main(int argc, char **argv){
	set_invocation(argv[0]);
	int thread_count;
	size_t memtotal;
	size_t memportion;
	size_t num1;
	int c;
	int float_size;
	long asize;
	long block_size;
	double te0;

        if(argc != 3){
                usage();
                exit(1);
        }else{
                for(c=0; c<argc; c++){
                        if( !strcmp(argv[c], "-h") || !strcmp(argv[c], "-H") ){
                                usage();
                                exit(0);
                        }
                }
        
                thread_count = atoi(argv[1]);
                memtotal = atoi(argv[2]);
                if(memtotal == 0 || thread_count == 0){
                        usage();
                        exit(1);
                }
        
		printf("\nlsbench test version 03/04/2010 linuxsociety.org\n");
                printf("\tRunning %d threads\n", thread_count);
                printf("\tUsing %luMB of mem\n\n", memtotal);
	}

	pthread_t *threads = (pthread_t *)malloc(sizeof(pthread_t) * thread_count);
	memportion = (int)(memtotal / 2 / thread_count );

	mem_table = (struct mem_chunk *)malloc( sizeof(struct mem_chunk) * thread_count);
        float_size=sizeof(float);
        asize = (long)(memtotal / 2 * 1024 * 1024 / float_size);
	block_size = (long)(memportion * 1024 * 1024);

	a = malloc(sizeof(float)*asize);
	b = malloc(sizeof(float)*asize);
	//put some random stuff in them
	srand ( time(NULL) );
	for(num1=0; num1<asize; num1++){
		a[num1] = (float)rand();
		b[num1] = (float)rand();
	}
	asize = (long)(memportion * 1024 * 1024 / float_size);
	for(num1=0; num1<thread_count; num1++){	
			//mem_table[num1].asize = asize;
			mem_table[num1].thread_id = num1;
			mem_table[num1].block_size = block_size;
			mem_table[num1].a = &a[num1*asize];
			mem_table[num1].b = &b[num1*asize];
	}


	gettimeofday(&starttime, NULL);
		for(num1=0; num1<thread_count; num1++){
			//mem_table.thread_id[num1]=num1;
			if (pthread_create(&threads[num1], NULL, MemWork1, &mem_table[num1].thread_id) != 0)
				perror("pthread_create"), exit(1);
		}
	

		for(num1=0; num1<thread_count; num1++){
			if (pthread_join(threads[num1], NULL) != 0)
				perror("pthread_join"),exit(1);
		}
	gettimeofday(&endtime, NULL);
	te0=((double)(endtime.tv_sec*1000000-starttime.tv_sec*1000000+endtime.tv_usec-starttime.tv_usec))/1000000;
	printf("Running Memory Benchmarks\n");
	// 6 times read+write
	printf("\tmemcpy  (read+write)        :");
	printf("\t%.03lfMB/s\n", (double)memtotal*6/te0);


	gettimeofday(&starttime, NULL);
	for(num1=0; num1<thread_count; num1++){
		if (pthread_create(&threads[num1], NULL, MemWork2, &mem_table[num1].thread_id) != 0)
			perror("pthread_create"), exit(1);
	}
	for(num1=0; num1<thread_count; num1++){
		if (pthread_join(threads[num1], NULL) != 0)
			perror("pthread_join"),exit(1);
	}
	gettimeofday(&endtime, NULL);
	te0=((double)(endtime.tv_sec*1000000-starttime.tv_sec*1000000+endtime.tv_usec-starttime.tv_usec))/1000000;
	printf("\tmemset       (write)        :");	
	printf("\t%.03lfMB/s\n", (double)memtotal*6/te0);


	gettimeofday(&starttime, NULL);
	for(num1=0; num1<thread_count; num1++){
		if (pthread_create(&threads[num1], NULL, MemWork3, &mem_table[num1].thread_id) != 0)
			perror("pthread_create"), exit(1);
	}
	for(num1=0; num1<thread_count; num1++){
		if (pthread_join(threads[num1], NULL) != 0)
			perror("pthread_join"),exit(1);
	}
	gettimeofday(&endtime, NULL);
	te0=((double)(endtime.tv_sec*1000000-starttime.tv_sec*1000000+endtime.tv_usec-starttime.tv_usec))/1000000;
	printf("\tmemmove (read+write)        :");
	printf("\t%.03lfMB/s\n", (double)memtotal*6/te0);

	free(a);
	free(b);

	printf("\nRunning Floating Point Benchmarks\n");
                
	printf("\tSingle Threaded Performance :");


        for(num1=0; num1<4; num1++){
                srand (time(NULL));
                z0[num1]=rand()+1;
                z1[num1]=rand()-1;
                //z1[num1]=rand()+1;
                //z0[num1]=rand()-1;
        }

	gettimeofday(&starttime, NULL);

	if (pthread_create(&threads[0], NULL, FloatWork, NULL) != 0)
		perror("pthread_create"), exit(1);
	if (pthread_join(threads[0], NULL) != 0)
		perror("pthread_join"),exit(1);

	gettimeofday(&endtime, NULL);
	te0=((double)(endtime.tv_sec*1000000-starttime.tv_sec*1000000+endtime.tv_usec-starttime.tv_usec))/1000000;
	te0 = 64/te0;
	printf("\t%.03lfGFLOPS\n", te0);

	printf("\tMulti-Threaded Performance  :");

        for(num1=0; num1<4; num1++){
                srand (time(NULL));
                z0[num1]=rand()+1;
                z1[num1]=rand()-1;
                //z1[num1]=rand()+1;
                //z0[num1]=rand()-1;
        }

	gettimeofday(&starttime, NULL);
        for(num1=0; num1<thread_count; num1++){
                if (pthread_create(&threads[num1], NULL, FloatWork, NULL) != 0)
                        perror("pthread_create"), exit(1);
        }
        for(num1=0; num1<thread_count; num1++){
                if (pthread_join(threads[num1], NULL) != 0)
                        perror("pthread_join"),exit(1);
        }
        gettimeofday(&endtime, NULL);
        te0=((double)(endtime.tv_sec*1000000-starttime.tv_sec*1000000+endtime.tv_usec-starttime.tv_usec))/1000000;
	te0 = (float)64*thread_count/te0;
	printf("\t%.03lfGFLOPS\n\n", te0);

	return 0;
}

/*
 * Thread body for the floating point benchmark.
 *
 * Loads the global z0 and z1 vectors into SSE registers and then runs a loop
 * of 4,000,000,000 iterations, each issuing two packed-single adds and two
 * packed-single multiplies on separate registers. The results are thrown
 * away; the caller only measures how long the loop takes.
 *
 * The argument is ignored. The function does not return normally: it ends
 * the calling thread with pthread_exit(0).
 *
 * One of three assembly variants is chosen at compile time: OSX, INTEL, or
 * the default GCC-style AT&T syntax.
 */
void *FloatWork(void *null)
{
	(void)null;

#ifdef OSX
        asm{
                movups xmm0 z0
                movups xmm1 z1
		movups xmm2 xmm1
		movups xmm3 xmm0
		movups xmm4 xmm1
		movups xmm5 xmm0
		movups xmm6 xmm1
		movups xmm7 xmm0
                mov ecx 4000000000
                LOOP1:
                	addps xmm0 xmm4
                	mulps xmm1 xmm5
                	addps xmm2 xmm6
			mulps xmm3 xmm7
		dec ecx
		jnz LOOP1
        }
#elif INTEL
		/*
		 * NOTE(review): this switches the assembler to Intel syntax and
		 * nothing below switches it back; confirm that INTEL builds are
		 * compiled with a matching assembler-dialect option.
		 */
		asm(".intel_syntax noprefix\n");
			asm("movups xmm0, [z0]\n");
			asm("movups xmm1, [z1]\n");
			asm("movups xmm2, xmm1\n");
			asm("movups xmm3, xmm0\n");
			asm("movups xmm4, xmm1\n");
			asm("movups xmm5, xmm0\n");
			asm("movups xmm6, xmm1\n");
			asm("movups xmm7, xmm0\n");
			asm("mov ecx, 4000000000\n");
			asm("LOOP1:\n");
				asm("addps xmm0, xmm4\n");
				asm("mulps xmm1, xmm5\n");
                		asm("addps xmm2, xmm6\n");
                		asm("mulps xmm3, xmm7\n");
      			asm("dec ecx\n");
			asm("jnz LOOP1\n");
#else
		/* The operand vectors are file-scope globals filled in by main(). */
		extern float z0[4];
		extern float z1[4];

		/*
		 * z0/z1 are passed as "m" operands so the compiler picks an
		 * addressing mode that suits the build (absolute or PC-relative).
		 * The loop uses a local numeric label so the sequence can be
		 * emitted more than once in an object file. Every register the
		 * sequence overwrites, plus the flags, is listed as clobbered.
		 */
		__asm__ __volatile__(
			"movups %[z0], %%xmm0\n"
			"movups %[z1], %%xmm1\n"
			"movups %%xmm1, %%xmm2\n"
			"movups %%xmm0, %%xmm3\n"
			"movups %%xmm1, %%xmm4\n"
			"movups %%xmm0, %%xmm5\n"
			"movups %%xmm1, %%xmm6\n"
			"movups %%xmm0, %%xmm7\n"
			"movl $0xee6b2800, %%ecx\n"	/* 4,000,000,000 iterations */
			"1:\n"
				"addps %%xmm4, %%xmm0\n"
				"mulps %%xmm5, %%xmm1\n"
				"addps %%xmm6, %%xmm2\n"
				"mulps %%xmm7, %%xmm3\n"
			"dec %%ecx\n"
			"jnz 1b\n"
			: /* no outputs */
			: [z0] "m" (z0), [z1] "m" (z1)
			: "xmm0", "xmm1", "xmm2", "xmm3",
			  "xmm4", "xmm5", "xmm6", "xmm7",
			  "ecx", "cc"
		);
#endif
	pthread_exit(0);
}

/*
 * Thread body for the memcpy benchmark.
 *
 * mem_arg points at an int holding this thread's index into mem_table.
 * The thread copies block_size bytes from its a slice to its b slice and
 * back again, three times over (six memcpy calls), then exits the thread.
 */
void *MemWork1(void *mem_arg)
{
	const int slot = *(int *)mem_arg;
	float *const buf_a = mem_table[slot].a;
	float *const buf_b = mem_table[slot].b;
	const long nbytes = mem_table[slot].block_size;
	int round;

	for (round = 0; round < 3; round++) {
		(void)memcpy(buf_b, buf_a, nbytes);
		(void)memcpy(buf_a, buf_b, nbytes);
	}

	pthread_exit(0);
}

/*
 * Thread body for the memset benchmark.
 *
 * mem_arg points at an int holding this thread's index into mem_table.
 * The thread fills its b slice and then its a slice with a byte value,
 * six times over with different values (twelve memset calls of block_size
 * bytes), then exits the thread.
 */
void *MemWork2(void *mem_arg)
{
	/* One row per round: fill byte for b, then fill byte for a. */
	static const int fill[][2] = {
		{ 124, 122 },
		{  12,  18 },
		{  24,  22 },
		{   2,   4 },
		{  36,  48 },
		{  96,  87 }
	};
	const int slot = *(int *)mem_arg;
	float *const buf_a = mem_table[slot].a;
	float *const buf_b = mem_table[slot].b;
	const long nbytes = mem_table[slot].block_size;
	size_t round;

	for (round = 0; round < sizeof fill / sizeof fill[0]; round++) {
		(void)memset(buf_b, fill[round][0], nbytes);
		(void)memset(buf_a, fill[round][1], nbytes);
	}

	pthread_exit(0);
}

/*
 * Thread body for the memmove benchmark.
 *
 * mem_arg points at an int holding this thread's index into mem_table.
 * The thread moves block_size bytes from its a slice to its b slice and
 * back again, three times over (six memmove calls), then exits the thread.
 */
void *MemWork3(void *mem_arg)
{
	const int slot = *(int *)mem_arg;
	float *const buf_a = mem_table[slot].a;
	float *const buf_b = mem_table[slot].b;
	const long nbytes = mem_table[slot].block_size;
	int round;

	for (round = 0; round < 3; round++) {
		(void)memmove(buf_b, buf_a, nbytes);
		(void)memmove(buf_a, buf_b, nbytes);
	}

	pthread_exit(0);
}
