#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <stdint.h>

#ifdef IS32BIT
#define ALGS 7
#else
#define ALGS 8
#endif

#define RANGES 4

#ifndef MAX_SIZE
#define MAX_SIZE 32768
#endif

#ifndef UINT32_MAX
#define UINT32_MAX (4294967295U)
#endif

/*
static void Winning_Alg(uint16_t *wins, uint16_t *winner){
	uint16_t b, c;
	for(b=c=0; b<ALGS; b++){
		if(wins[b] > c){
			*winner = b;
			c = wins[b];
		}
	}
}
*/

/*
	uint16_t a;
	uint16_t b;
	uint16_t c;
	uint16_t d;
//#ifndef uint128_t
typedef struct _UINT128 {
    __int64 LowPart;
    __int64 HighPart;
} UINT128;
//#endif
*/

void Increment_Winners(uint16_t start, uint16_t end, uint16_t **input, uint16_t *output){
	uint16_t a, b;
	uint16_t cur_cycles;
	for(a = start; a<end; a++){
		cur_cycles = UINT16_MAX;
		for(b=0; b<ALGS; b++){
			if(input[a][b] < cur_cycles)
				cur_cycles = input[a][b];
		}
		for(b=0; b<ALGS; b++){
			if(input[a][b] == cur_cycles)
				output[b]++;
		}
	}
}

void Dec_and_Inc(uint16_t *input, uint16_t *output1, uint16_t *output2){
	uint16_t a=UINT16_MAX, b;
	for(b=0; b<ALGS; b++){
		if(input[b] < a)
			a = input[b];
	}
	for(b=0; b<ALGS; b++){
		if(input[b] == a){
			output1[b]--;
			output2[b]++;
		}
	}
}

/*
void Highest(uint16_t *input, uint16_t *output){
	uint16_t a = input[0];
	uint16_t b = 1;
	uint16_t d = 0;
	while(b<ALGS){
		if(input[b] > a){
			a = input[b];
			d = b;
		}
		b++;
	}
	*output = d;
}
*/

/*
static void Print_Alg(uint16_t alg){
	if(alg>(ALGS-1)){
		printf("error");
		return;
	}
	uint8_t n1;
#ifdef IS32BIT
	const char *algs[] = {"byte_loop", "loop", "unrolled_loop", "rep_byte", "rep_4byte", "vector_loop", "libcall"};
#else
	const char *algs[] = {"byte_loop", "loop", "unrolled_loop", "rep_byte", "rep_4byte", "rep_8byte", "vector_loop", "libcall"};
#endif
	printf("\t%s", algs[alg]);
	for(n1=strlen(algs[alg]); n1<14; n1++)
		printf(" ");
}
*/

//void Copy_Algs(uint8_t *dest, uint8_t *src, uint8_t dual){
//	dest[0] = src[0]
//}


#include <xmmintrin.h>
struct Data
{
    union
    {
        uint16_t i[ALGS];
        __m128 v;
    };
};

void memcpy128(__m128* __restrict b, const __m128* __restrict a){
	__m128 t0 = a[0];
	b[0] = t0;
}
void memcpy256(__m128* __restrict b, const __m128* __restrict a){
	__m128 t0 = a[0];
	__m128 t1 = a[1];
	b[0] = t0;
	b[1] = t1;
}

/*
void memcpy128(void *__restrict b, const void *__restrict a){
	unsigned __int128 *s1 = b;
	const unsigned __int128 *s2 = a;
	*s1 = *s2;
}

void memcpy256(void *__restrict b, const void *__restrict a){
	unsigned __int128 *s1 = b;
	const unsigned __int128 *s2 = a;
	*s1++ = *s2++;
	*s1++ = *s2++;
}
*/
void Print_with_spaces(const char *alg, uint16_t length){
	uint16_t a = strlen(alg);
	//uint16_t b = 0;
	printf("%s", alg);
	while(a<length){
		printf(" ");
		a++;
	}
}

int main(int argc, char **argv){
	FILE *in;
	uint16_t a,b,c=0,d=0;
	uint16_t cur_wins = 0;
	uint16_t max_wins = 0;
	//size_t cur_cycles = 0;
	uint16_t winning_starts[RANGES] = {0,0,0,0};
	//uint16_t winning_algs[RANGES] = {0,0,0,0};
	//uint32_t *data[ALGS];
	uint16_t **data;
	//uint32_t *fastest;
	uint16_t read_buffer_size = 64;
	char *read_buffer;
	//uint16_t start_a = 0;
	uint16_t start_b = 0;
	uint16_t start_c = 0;
	uint16_t start_d = 0;
	const uint16_t forced_a_switch = MAX_SIZE/4;
	const uint16_t min_start_a = 0;
	const uint16_t min_start_b = 1;
	const uint16_t min_start_c = 2;
	const uint16_t min_start_d = 3;
	const uint16_t max_end_d = MAX_SIZE;
	//uint8_t alg_a = 0;
	uint16_t winners[RANGES] = {0,0,0,0};

//	uint16_t wins[RANGES-1][ALGS];
//	uint16_t wins_T[RANGES][ALGS];
//	uint16_t wins_B[ALGS];

struct Data wins[RANGES-1];
struct Data wins_T[RANGES];
struct Data wins_B;

//abcd [4][8]
//abc [3][8]
//ab [2][8]

//	unsigned __int128 wins_T[RANGES];
//	unsigned __int128 wins[RANGES-1];
//	unsigned __int128 wins_B[RANGES-2];

#ifdef IS32BIT
	const char *algs[] = {"byte_loop", "loop", "unrolled_loop", "rep_byte", "rep_4byte", "vector_loop", "libcall"};
#else
	const char *algs[] = {"byte_loop", "loop", "unrolled_loop", "rep_byte", "rep_4byte", "rep_8byte", "vector_loop", "libcall"};
#endif


	//if(argc != 1)
	//	max_end_d = atoi(argv[1]);
	printf("Testing to %u\n", max_end_d);

	data = (uint16_t **)malloc(sizeof(uint16_t *)*max_end_d);
	for(a=0; a<max_end_d; a++){
		data[a] = (uint16_t *)malloc(sizeof(uint16_t)*ALGS);
	}
	//fastest = (uint32_t *)malloc(sizeof(uint32_t)*max_end_d);
	read_buffer = (char *)malloc(read_buffer_size);
	(void)memset( (void *)read_buffer, '\0', read_buffer_size);

	(void)memset((void *)wins, '\0', sizeof(uint16_t)*(RANGES-1)*ALGS);
	(void)memset((void *)wins_T, '\0', sizeof(uint16_t)*RANGES*ALGS);

	for(a=0; a<8; a++){
#ifdef IS32BIT
		if(a == 5)
			continue;
#endif
		(void)sprintf(read_buffer, "results/r%u.txt", a+1);
		in = fopen(read_buffer, "r");
		b = 0;
		(void)memset( (void *)read_buffer, '\0', read_buffer_size);
		while( fgets(read_buffer, read_buffer_size, in) != NULL){
			if(b == max_end_d)
				break;
			if(atoi(read_buffer) > UINT16_MAX)
				data[b][c] = UINT16_MAX;
			else
				data[b][c] = atoi(read_buffer);

			b++;
		}
		fclose(in);
		c++;
	}
	free(read_buffer);

	//calculate wins for each alg of each range on start (NOTE: will be 1 position checked unless mins_start_* changed)
	//except range d where max_end_d - min_start_d are checked

	Increment_Winners(min_start_d, max_end_d, data, wins_T[min_start_d].i);
	Increment_Winners(min_start_c, min_start_d, data, wins[min_start_c].i);
	Increment_Winners(min_start_b, min_start_c, data, wins[min_start_b].i);
	Increment_Winners(min_start_a, min_start_b, data, wins[min_start_a].i);


//wins_T[min_start_d].i

//	(void)memcpy((void *)wins_B, (const void *)wins[min_start_a], sizeof(uint16_t)*ALGS);
	//Copy_Algs((uint8_t *)wins_B, (uint8_t *)wins[min_start_a], ALGS);
	memcpy128((void *)&wins_B.v, (const void *)&wins[min_start_a].v);

	//start range d
	for(start_d = min_start_d; start_d<max_end_d; start_d++){
//		printf("%u/MAX_SIZE\n", start_d);
		//(void)memcpy((void *)wins_T[min_start_c], (const void *)wins[min_start_c], sizeof(uint16_t)*ALGS);
		//Copy_Algs((uint8_t *)wins_T[min_start_c], (uint8_t *)wins[min_start_c], ALGS);
		memcpy128((void *)&wins_T[min_start_c].v, (const void *)&wins[min_start_c].v);
		//pick d winning alg up front
/*
		for(b=c=0; b<ALGS; b++){
			if(wins_T[min_start_d][b] > c){
				winners[min_start_d] = b;
				c = wins_T[min_start_d][b];
			}
		}

*/
		c = wins_T[min_start_d].i[0];
		for(b=1; b<ALGS; b++){
			if(wins_T[min_start_d].i[b] > c){
				c = wins_T[min_start_d].i[b];
				d = b;
			}
		}
		winners[min_start_d] = d;

//Highest(wins_T[min_start_d], &winners[min_start_d]);

		//wins_B
		//(void)memcpy((void *)wins[min_start_b], (const void *)wins_B, sizeof(uint16_t)*ALGS);
		//Copy_Algs((uint8_t *)wins[min_start_b], (uint8_t *)wins_B, ALGS);
		memcpy128((void *)&wins[min_start_b].v, (const void *)&wins_B.v);
		//start range c
		for(start_c = min_start_c; start_c<start_d; start_c++){
			//(void)memcpy((void *)wins_T[0], (const void *)wins[0], sizeof(uint16_t)*ALGS);
			//(void)memcpy((void *)wins_T[1], (const void *)wins[1], sizeof(uint16_t)*ALGS);

			//(void)memcpy((void *)wins_T, (const void *)wins, sizeof(uint16_t)*ALGS*2);
			//Copy_Algs((uint8_t *)wins_T, (uint8_t *)wins, ALGS*2);
			memcpy256((void *)&wins_T[0].v, (const void *)&wins[0].v);

			// *((__uint128_t *)wins_T[0]) = *((__uint128_t *)wins[0]);
			// *((__uint128_t *)wins_T[1]) = *((__uint128_t *)wins[1]);

			c = wins_T[min_start_c].i[0];
			for(b=1; b<ALGS; b++){
				if(wins_T[min_start_c].i[b] > c){
					c = wins_T[min_start_c].i[b];
					d = b;
				}
			}
			winners[min_start_c] = d;
/*
			c = 0;
			//pick c winning alg
			for(b=0; b<ALGS; b++){
				if(wins_T[min_start_c][b] > c){
					winners[min_start_c] = b;
					c = wins_T[min_start_c][b];
				}
			}

*/
//Highest(wins_T[min_start_c], &winners[min_start_c]);
//cur_wins = wins_T[min_start_c][winners[2]]+wins_T[min_start_d][winners[3]];
			//start range b
			for(start_b = min_start_b; start_b<start_c; start_b++){
				if(start_b > forced_a_switch)
					break;

				//pick b and a winning algs
				for(a=b=c=0; b<ALGS; b++){
					if(wins_T[min_start_a].i[b] > c){
						winners[min_start_a] = b;
						c = wins_T[min_start_a].i[b];
					}
					if(wins_T[min_start_b].i[b] < a)
						continue;
					winners[min_start_b] = b;
					a = wins_T[min_start_b].i[b];
				}

/*
//for(a=0; a<min_start_c; a++){
if(wins_T[min_start_a][0] > wins_T[min_start_a][1]){
c=wins_T[min_start_a][0];
winners[min_start_a] = 0;
}

//c=0;
for(b=0; b<ALGS; b++){
if(wins_T[a][b] > c){
winners[a] = b;
c = wins_T[a][b];
}
}
//}

wins_T[a][b]
wins_T[a][b]
wins_T[a][b]
wins_T[a][b]
wins_T[a][b]
wins_T[a][b]
wins_T[a][b]
wins_T[a][b]
*/

//c=0;
//for(b=0; b<ALGS; b++){
//if(wins_T[min_start_b][b] > c){
//winners[min_start_b] = b;
//c = wins_T[min_start_b][b];
//}
//}


/*
				a = wins_T[min_start_a][0];
				b = wins_T[min_start_b][0];
				for(c=1; c<ALGS; c++){
					if(wins_T[min_start_a][c] > a){
						a = wins_T[min_start_a][c];
						winners[min_start_a] = c;
					}

					if(wins_T[min_start_b][c] < b)
						continue;
					b = wins_T[min_start_b][c];
					winners[min_start_b] = c;
				}
*/
				//winners[min_start_a] = d;
				//winners[min_start_b] = e;
//				cur_wins = d+e+
//wins_T[min_start_c][winners[2]]+wins_T[min_start_d][winners[3]];
				
				for(a=cur_wins=0; a<RANGES; a++)
					cur_wins += wins_T[a].i[winners[a]];

				Dec_and_Inc(data[start_b], wins_T[min_start_b].i, wins_T[min_start_a].i);
				
				if(cur_wins < max_wins)
					continue;
				//winners[min_start_a] = d;
				//winners[min_start_b] = e;
				//winning_starts[0] = start_a;
				winning_starts[min_start_b] = start_b;
				winning_starts[min_start_c] = start_c;
				winning_starts[min_start_d] = start_d;
				//for(a=0; a<RANGES; a++)
				//	winning_algs[a] = winners[a];
				//(void)memcpy((void *)winning_algs, (const void *)winners, sizeof(uint16_t)*RANGES);
				max_wins = cur_wins;
			}
			//selectively decremember alg in range c that gained a win
			Dec_and_Inc(data[start_c], wins_T[min_start_c].i, wins[min_start_b].i);
		}
		//selectively decrement alg that lost a win
		Dec_and_Inc(data[start_d], wins_T[min_start_d].i, wins[min_start_c].i);
	}


	printf("\nTotal wins for all data (ties count as win)\n");
	Increment_Winners(min_start_a, max_end_d, data, wins_B.i);
	for(a=d=0; a<ALGS; a++){
		//printf("\t%s", algs[a]);
		printf("\t");
		Print_with_spaces(algs[a], 15);
		printf("%u\n", wins_B.i[a]);
		d += wins_B.i[a];
	}
	printf("\ntotal: %u\n\n", d);
	//Print_with_spaces("total:", 15);
	//printf("%u\n\n", d);

	printf("Config with most wins/ties:\n");
	printf("\tstart\twins\talg/s\n");
	d = 0;
	for(a=0; a<RANGES; a++){
		if(a != min_start_d)
			c = winning_starts[a+1];
		else
			c = max_end_d;
		(void)memset((void *)wins_B.i, '\0', sizeof(uint16_t)*ALGS);
		Increment_Winners(winning_starts[a], c, data, wins_B.i);

                max_wins = 0;
                for(b=0; b<ALGS; b++){
                        if(wins_B.i[b] > max_wins)
                                max_wins = wins_B.i[b];
                }
                printf("\t%u\t", winning_starts[a]);

		printf("%u\t", max_wins);
		d+= max_wins;
                for(b=0; b<ALGS; b++){
                        if(wins_B.i[b] == max_wins)
                                printf("%s(%u) ", algs[b], b);
                }
                printf("\n");
        }
	printf("\ntotal: %u\n", d);
	//printf("Total wins for all data\n");
	//Increment_Winners(min_start_d, max_end_d, data, wins_T[min_start_d]);


	for(a=0; a<max_end_d; a++)
		free(data[a]);
	free(data);

	return 0;
}
