#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <stdint.h>
#include <pthread.h>
#include <math.h>
#define __STDC_FORMAT_MACROS
#include <inttypes.h>

#ifdef IS32BIT
#define ALGS 7
#else
#define ALGS 8
#endif

#define RANGES 4

#ifndef MAX_SIZE
#define MAX_SIZE 1024
#endif

uint8_t ties[MAX_SIZE];
uint16_t A_Wins[MAX_SIZE-2];
uint16_t wins_BS[ALGS];
const uint16_t min_start_a = 0;
const uint16_t min_start_b = 1;
const uint16_t min_start_c = 2;
const uint16_t min_start_d = 3;

struct Thread_I{
	uint16_t start_d;
	uint16_t end_d;
	uint16_t win_start_b;
	uint16_t win_start_c;
	uint16_t win_start_d;
	uint16_t max_wins;
};

struct Thread_I *Actions_t_info;

void SET_ONE(const uint16_t pos, const uint8_t bit){
	uint8_t cur = ties[pos];
	uint8_t b0 = (bit == 0) ? 1   : (((cur<<7) & 128) >= 128);
	uint8_t b1 = (bit == 1) ? 2   : (((cur<<6) & 128) >= 128)<<1;
	uint8_t b2 = (bit == 2) ? 4   : (((cur<<5) & 128) >= 128)<<2;
	uint8_t b3 = (bit == 3) ? 8   : (((cur<<4) & 128) >= 128)<<3;
	uint8_t b4 = (bit == 4) ? 16  : (((cur<<3) & 128) >= 128)<<4;
	uint8_t b5 = (bit == 5) ? 32  : (((cur<<2) & 128) >= 128)<<5;
	uint8_t b6 = (bit == 6) ? 64  : (((cur<<1) & 128) >= 128)<<6;
	uint8_t b7 = (bit == 7) ? 128 : (((cur) & 128)  >= 128)<<7;

	ties[pos] = b0+b1+b2+b3+b4+b5+b6+b7;
}

uint8_t IS_SET(const uint16_t pos, const uint8_t bit){
	uint8_t cur = ties[pos];
	cur <<= (7-bit);
	cur &= 128;

	if(cur >= 128)
		return 1;
	else
		return 0;
}

void Increment_Winners(uint16_t start, const uint16_t end, uint16_t *output){
	uint8_t a;
	(void)memset((void *)output, '\0', sizeof(uint16_t)*ALGS);
	while(start<end){
		for(a=0; a<ALGS; a++){
			if(IS_SET(start, a))
				output[a]++;
		}
		start++;
	}
}

void Print_with_spaces(const char *alg, const uint16_t length){
	uint16_t a = strlen(alg);
	printf("%s", alg);
	while(a<length){
		printf(" ");
		a++;
	}
}

//uint64_t Combs(uint16_t range){
//	return (((uint64_t)pow(range, min_start_d)/6 - pow(range, min_start_c)/min_start_c + range/min_start_d));
//}

void *Range_Action(void *tid){
	const uint16_t t = *(uint16_t *)tid;
	uint8_t b;
	uint16_t a,c=0; //,d=0;
	uint16_t wt_d, wt_c;
	uint16_t max_wins = 0;
	uint16_t start_b = 0;
	uint16_t start_c = 0;
	uint16_t start_d = 0;
	uint16_t strongest_ab = 0;
//	uint16_t win_start_b = 0;
//	uint16_t win_start_c = 0;
//	uint16_t win_start_d = 0;
	//const uint16_t min_start_ta = 0;
	//const uint16_t min_start_tb = min_start_b;
	//const uint16_t min_start_tc = min_start_c;
	const uint16_t min_start_td = Actions_t_info[t].start_d;
	const uint16_t max_end_d = Actions_t_info[t].end_d;

	uint16_t wins_B[ALGS];
	uint16_t wins_C[ALGS];
	uint16_t wins_D[ALGS];
	uint16_t wins_TB[ALGS];
	uint16_t wins_TC[ALGS];
#ifdef VERBOSE
	uint64_t total_comb = 0;
#endif
	//calculate wins for each alg of each range on start (NOTE: will be 1 position checked unless min_start_* changed)
	//except range d where max_end_d - min_start_d are checked
	Increment_Winners(min_start_c, min_start_td, wins_C);
	Increment_Winners(min_start_td, MAX_SIZE, wins_D);

	//tmp2 = min_start_d - min_start_a;
	//pow(tmp2, 3)/6 - pow(tmp2, 2)/2 + tmp2/3;
	//pow(X[n], 2)/2 - 1.5*X[n] + 1;

	//start range d
	for(start_d = min_start_td; start_d<max_end_d; start_d++){
		//printf("%u/MAX_SIZE\n", start_d);
		(void)memcpy((void *)wins_TC, (const void *)wins_C, sizeof(uint16_t)*ALGS);
		(void)memcpy((void *)wins_B, (const void *)wins_BS, sizeof(uint16_t)*ALGS);
		//pick d winning alg
		for(wt_d=b=0; b<ALGS; b++){
			if(wins_D[b] > wt_d)
				wt_d = wins_D[b];
			if(IS_SET(start_d, b)){
				wins_D[b]--;
				wins_C[b]++;
			}
		}
		//start range c
		for(start_c = min_start_c; start_c<start_d; start_c++){
			(void)memcpy((void *)wins_TB, (const void *)wins_B, sizeof(uint16_t)*ALGS);
			for(wt_c=b=0; b<ALGS; b++){
				if(wins_TC[b] > wt_c)
					wt_c = wins_TC[b];
				if(IS_SET(start_c, b)){
					wins_TC[b]--;
					wins_B[b]++;
				}
			}
			wt_c += wt_d;
			a = 0;
			strongest_ab = 0;
			//start range b
			for(start_b = min_start_b; start_b<start_c; start_b++){
				//pick b and a winning algs
				for(c=b=0; b<ALGS; b++){
					if(wins_TB[b] > c)
						c = wins_TB[b];
					if(IS_SET(start_b, b))
						wins_TB[b]--;
				}
				c += A_Wins[start_b];
				if(c > strongest_ab){
					strongest_ab = c;
					a = start_b;
				}
				//b = c + A_Wins[start_b] + wt_c;
				//if(b > max_wins){
				//	max_wins = b;
				//	win_start_b = start_b;
				//	win_start_c = start_c;
				//	win_start_d = start_d;
				//}

#ifdef VERBOSE
				total_comb++;
#endif
			}
			c = strongest_ab + wt_c;
			if(c > max_wins){
				max_wins = c;
				Actions_t_info[t].win_start_b = a;
				Actions_t_info[t].win_start_c = start_c;
				Actions_t_info[t].win_start_d = start_d;
				//win_start_b = a;
				//win_start_c = start_c;
				//win_start_d = start_d;
			}
		}
	}

	//Actions_t_info[t].win_start_b = win_start_b;
	//Actions_t_info[t].win_start_c = win_start_c;
	//Actions_t_info[t].win_start_d = win_start_d;
	Actions_t_info[t].max_wins = max_wins;
#ifdef VERBOSE
	printf("thread: %u combs: %zu\n", t, total_comb);
#endif
	pthread_exit(NULL);
}

int main(int argc, char **argv){
	FILE *in;
	uint16_t a,b,c=0; //,d=0;
	//uint16_t wt_d;
	uint16_t max_wins = 0;
	uint64_t slice_a = 0, slice_b, tmp1; //tmp_a, tmp_c;
	uint16_t winning_starts[RANGES] = {0,0,0,0};
	uint16_t **data;
	uint16_t read_buffer_size = 16;
	char *read_buffer;
	uint16_t start_b = 0;

	//const uint16_t min_start_a = 0;
	//const uint16_t min_start_b = 1;
	//const uint16_t min_start_d = 3;

	const uint16_t max_end_d = MAX_SIZE;
	uint16_t wins_B[ALGS];

#ifdef IS32BIT
	const char *algs[] = {"byte_loop", "loop", "unrolled_loop", "rep_byte", "rep_4byte", "vector_loop", "libcall"};
#else
	const char *algs[] = {"byte_loop", "loop", "unrolled_loop", "rep_byte", "rep_4byte", "rep_8byte", "vector_loop", "libcall"};
#endif
	uint16_t Threads = 1;
	pthread_t *Actions_thread_ptrs;
	uint16_t *Actions_t_ids;
	uint16_t Tcount;

	//if(argc != 1)
	//      max_end_d = atoi(argv[1]);
	printf("Testing to %u\n", max_end_d);

	if(argc > 1)
		Threads = atoi(argv[1]);

	data = (uint16_t **)malloc(sizeof(uint16_t *)*max_end_d);
	for(a=min_start_a; a<max_end_d; a++){
		data[a] = (uint16_t *)malloc(sizeof(uint16_t)*ALGS);
	}
	read_buffer = (char *)malloc(read_buffer_size);

	for(a=0; a<8; a++){
#ifdef IS32BIT
		if(a == 5)
			continue;
#endif
		(void)sprintf(read_buffer, "results/r%u.txt", a+1);
		in = fopen(read_buffer, "r");
		b = 0;
		while( fgets(read_buffer, read_buffer_size, in) != NULL){
			if(b == max_end_d)
				break;
			if(atoi(read_buffer) < UINT16_MAX)
				data[b][c] = atoi(read_buffer);
			else
				data[b][c] = UINT16_MAX;
			b++;
		}
		fclose(in);
		c++;
	}
	free(read_buffer);

	for(a = min_start_a; a<max_end_d; a++){
		c = UINT16_MAX;
		for(b=0; b<ALGS; b++){
			if(data[a][b] < c)
				c = data[a][b];
		}
		for(b=0; b<ALGS; b++){
			if(data[a][b] == c)
				SET_ONE(a, b);
		}
		free(data[a]);
	}
	free(data);

	Increment_Winners(min_start_a, min_start_b, wins_B);
	for(start_b = min_start_b; start_b<MAX_SIZE-2; start_b++){
		for(c=b=0; b<ALGS; b++){
			if(wins_B[b] > c){
				c = wins_B[b];
				a = b;
			}
			if(IS_SET(start_b, b))
				wins_B[b]++;
		}
		A_Wins[start_b] = c;
	}

	Increment_Winners(min_start_b, min_start_c, wins_BS);

	Actions_thread_ptrs = (pthread_t *)malloc(sizeof(pthread_t) * Threads );
	Actions_t_ids = (uint16_t *)malloc( sizeof(uint16_t) * Threads );
	Actions_t_info = (struct Thread_I *)malloc( sizeof(struct Thread_I) * Threads );
	(void)memset((void *)Actions_t_info, '\0', sizeof(struct Thread_I) * Threads );

	//tmp2 = max-min;
	//pow(tmp2, 3)/6 - pow(tmp2, 2)/2 + tmp2/3;
	//pow(Y, 3)/6 - pow(Y, 2)/2 + Y/3 = (pow(MAX_SIZE, 3)/6 - pow(MAX_SIZ, 2)/2 + MAX_SIZ/3) / Threads;
	tmp1 = pow(MAX_SIZE, 3)/6 - pow(MAX_SIZE, 2)/2 + MAX_SIZE/3;
	slice_a = (uint64_t)floor(tmp1/Threads);
#ifdef VERBOSE
	slice_b = (uint64_t)ceil(pow(MAX_SIZE, 3)/6 - pow(MAX_SIZE, 2)/2 + (MAX_SIZE)/3);
	printf("total combinations to check: %"PRIu64"\n", slice_b);
	printf("total combinations per thread: %"PRIu64"\n", slice_a);
	slice_b -= (uint64_t)ceil(pow(MAX_SIZE-1, 3)/6 - pow(MAX_SIZE-1, 2)/2 + (MAX_SIZE-1)/3);
	printf("last level granularity: %"PRIu64"\n", slice_b);

	//printf("total combinations to check: %"PRIu64"\n", slice_b);
	//printf("total combinations per thread: %"PRIu64"\n", slice_a);
	//printf("total combinations to check: %"PRIu64"\n", Combs(MAX_SIZE-3));
#endif

	a = min_start_d;
	for(b=0; b<Threads; b++){
		Actions_t_ids[b] = b;
		Actions_t_info[b].start_d = a;
		if(a > (uint16_t)MAX_SIZE){
			printf("Error threads count too high, try %u or lower\n", b);
			free(Actions_thread_ptrs);
			free(Actions_t_ids);
			free(Actions_t_info);
			exit(1);
		}
		c = a+1;
		slice_b = 0;

		while(slice_b<slice_a){
			tmp1 = pow(c, 3)/6 - pow(c, 2)/2 + c/3;
			slice_b = (uint64_t)( (tmp1 - (pow(a, 3)/6 - pow(a, 2)/2 + a/3)) );
			c++;
		}
		slice_b -= slice_a;
		tmp1 = pow(c-1, 3)/6 - pow(c-1, 2)/2 + (c-1)/3;
		tmp1 -= (uint64_t)(pow(a, 3)/6 - pow(a, 2)/2 + a/3);
		if(abs(tmp1-slice_a) < slice_b)
			c--;
		//if(b!=Threads-1)
		//	c-=2; //-= Threads-b;
		a = c;
		Actions_t_info[b].end_d = a;
	}
	Actions_t_info[Threads-1].end_d = (uint16_t)MAX_SIZE;

double X, Y, T, M, A, B, C;
M = pow(MAX_SIZE, 3)/6 - pow(MAX_SIZE, 2)/2 + (MAX_SIZE)/3;
T = Threads;
//M /= T;
Y = 0;
for(b=0; b<Threads; b++){
//T = b;
//X = stop -(-3*Y-3)/3
//X = -(-3*Y-3)/3 + pow( ((sqrt(243*pow(M,2)-pow(T,2))/pow(3, 3/2)*T) + 3*M/T), 1/3) + 1 / 3* pow( (( sqrt(243*pow(M,2)-pow(T,2)) /pow(3,3/2)*T) + 3*M/T), 1/3);

A = -1*(-3*Y-3)/3;
B = pow( ((sqrt(243*pow(M,2)-pow(T,2))/pow(3, 3/2)*T) + 3*M/T), 1/3);
C = 1 / 3* pow( (( sqrt(243*pow(M,2)-pow(T,2)) /pow(3,3/2)*T) + 3*M/T), 1/3);

//B = pow( ((sqrt(243*pow(M,2)-1)/pow(3, 3/2)) + 3*M), 1/3);
//C = 1 / 3* pow( (( sqrt(243*pow(M,2)-1) /pow(3,3/2)) + 3*M), 1/3);

X = A+B+C;

//M/T = ((X-Y)^3/6 - (X-Y)^2/2 + (X-Y)/3);
//pow(X-Y, 3)/6 - pow(X-Y, 2)/2 + (X-Y)/3;

printf("start: %lf stop: %lf combos: %lf\n", Y, X, M);
Y = X;
}


#ifdef VERBOSE
	for(Tcount = 0; Tcount < Threads; Tcount++){
		printf("T: %u end: %u\n", Tcount, Actions_t_info[Tcount].end_d);
	}
#endif
	printf("Launching %u Action threads.\n", Threads);
	for(Tcount = 0; Tcount < Threads; Tcount++){
		if(pthread_create(&Actions_thread_ptrs[Tcount], NULL, Range_Action, (void *)&Actions_t_ids[Tcount]) != 0){
			printf("Error: pthread_create failed for action thread: %u !\n", Tcount);
			printf("\tExiting!\n");
			free(Actions_thread_ptrs);
			free(Actions_t_ids);
			free(Actions_t_info);
			exit(1);
		}
	}


	for(Tcount=0; Tcount < Threads; Tcount++){
		if(pthread_join(Actions_thread_ptrs[Tcount], NULL) != 0){
			printf("Error: pthread_join failed for action thread: %u !\n", Tcount);
			printf("\tExiting!\n");
			free(Actions_thread_ptrs);
			free(Actions_t_ids);
			free(Actions_t_info);
			exit(1);
		}
	}

	free(Actions_thread_ptrs);
	free(Actions_t_ids);

	a = 0;
	max_wins = 0;
	//grab thread with most max_wins
	for(b=0; b<Threads; b++){
		if(Actions_t_info[b].max_wins > max_wins){
			max_wins = Actions_t_info[b].max_wins;
			a = b;
		}
	}

	winning_starts[min_start_b] = Actions_t_info[a].win_start_b;
	winning_starts[min_start_c] = Actions_t_info[a].win_start_c;
	winning_starts[min_start_d] = Actions_t_info[a].win_start_d;

	free(Actions_t_info);

	printf("\nTotal wins for all data (ties count as win)\n");
	Increment_Winners(min_start_a, max_end_d, wins_B);
	for(a=c=0; a<ALGS; a++){
		printf("\t");
		Print_with_spaces(algs[a], 15);
		printf("%u\n", wins_B[a]);
		c += wins_B[a];
	}
	printf("\ntotal: %u\n\n", c);
	printf("Config with most wins/ties:\n");
	printf("\tstart\twins\talg/s\n");

	for(start_b=a=0; a<RANGES; a++){
		if(a != min_start_d)
			c = winning_starts[a+1];
		else
			c = max_end_d;
		Increment_Winners(winning_starts[a], c, wins_B);

		max_wins = 0;
		for(b=0; b<ALGS; b++){
			if(wins_B[b] > max_wins)
				max_wins = wins_B[b];
		}
		printf("\t%u\t%u\t", winning_starts[a], max_wins);
		start_b += max_wins;
		for(b=0; b<ALGS; b++){
			if(wins_B[b] == max_wins)
			printf("%s(%u) ", algs[b], b);
		}
		printf("\n");
	}
	printf("\ntotal: %u\n", start_b);

	return 0;
}
