#include "globals.h"
#include "sha.h"
#include "salsa.h"
#include "shittify.h"
#include "scrypt.h"


/*
 * search - OpenCL kernel, one candidate per work-item.
 *
 * Each work-item builds a 128-byte buffer X from a chain of SHA256() calls
 * over the header words in `input`, the supplied midstate and a per-item
 * value `gid`, runs scrypt_core() on X using `padcache`, hashes the result
 * again, and finally compares the byte-swapped last word of the resulting
 * state against `target`. On a hit it invokes SETFOUND(gid).
 *
 * Parameters:
 *   input      - header data; this kernel reads input[0]..input[4].
 *   output     - not referenced directly in this body; presumably written
 *                by the SETFOUND macro (defined elsewhere) - confirm.
 *   padcache   - passed through unchanged to scrypt_core(); element type is
 *                uint16 when CLSIZE == 64, uint8 otherwise.
 *   midstate0  - first four words loaded into the initial hash state (tmpc).
 *   midstate16 - last four words loaded into the initial hash state (tmpc).
 *   target     - upper bound for EndianSwapa(ostate.s7) to count as found.
 *
 * NOTE(review): SHA256, FXRD, SETFOUND, EndianSwapa, SK00..SK05, zero, one,
 * fixedW, shittify/unshittify and scrypt_core all come from the included
 * headers and are not visible here; comments about what they do are
 * assumptions unless stated otherwise. The SHA256/SK02/SK03 pattern looks
 * like an HMAC/PBKDF2 construction - confirm against sha.h.
 */
// The work-group size is fixed at compile time to WORKSIZE x 1 x 1.
__attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
__kernel void search(const __global uint4 *restrict input,
volatile __global uint *restrict output,
#if (CLSIZE == 64)
__global uint16 *restrict padcache,
#else
__global uint8 *restrict padcache,
#endif
const uint4 midstate0, const uint4 midstate16, const uint target){





#if (CLSIZE == 64)
	// Disabled experiments kept from the original author:
	//event_t e = async_work_group_copy(lbuff, gbuff, 512, 0);
    //wait_group_events(1, &e);

//padcache[(get_global_id(0)%CONCURRENT_THREADS)<<1].s0 = 0;
//barrier(CLK_GLOBAL_MEM_FENCE);
	// Working buffer: 2 x uint16 = 32 words (128 bytes).
	uint16 X[2];
//	U32 X;

#else
	// Working buffer: 4 x uint8 = 32 words (128 bytes).
	uint8 X[4];
#endif

	// First three words of input[4]; combined with gid they form the
	// leading four words of every tmpa block built below.
	uint in0 = input[4].s0;
	uint in1 = input[4].s1;
	uint in2 = input[4].s2;

	// Per-work-item value placed in word 3 of tmpa and reported via SETFOUND.
	// With GOFFSET defined the global id is used as-is; otherwise
	// input[4].s3 is added as a base.
#ifdef GOFFSET
	uint gid = get_global_id(0);
#else
	uint gid = input[4].s3 + get_global_id(0);
#endif

	// ostate/tstate start from the standard SHA-256 initial hash values.
	uint8 ostate = {0x6a09e667U, 0xBB67AE85U, 0x3C6EF372U, 0xA54FF53AU, 0x510e527fU, 0x9b05688cU, 0x1F83D9ABU, 0x5BE0CD19U};
	uint8 tstate = ostate;
	// tmpa/tmpb: the two 8-word halves handed to SHA256(); SK00/SK01 are
	// presumably the padding marker and length words - confirm.
	uint8 tmpa = {in0, in1, in2, gid, SK00, zero, zero, zero};
	uint8 tmpb = {zero, zero, zero, zero, zero, zero, zero, SK01};
	// tmpc starts as the caller-supplied midstate (midstate0 | midstate16).
	uint8 tmpc = {midstate0.s0, midstate0.s1, midstate0.s2, midstate0.s3, midstate16.s0, midstate16.s1, midstate16.s2, midstate16.s3};
	// Scalar copies of a hash state, used for save/restore and as the
	// working variables A..H of the FXRD rounds further down.
	uint tsb0, tsb1, tsb2, tsb3, tsb4, tsb5, tsb6, tsb7;
	// Counter written into tmpa.s4 in the X-building loop; takes 1,2,3,4.
	uint lnum = 1;


// Disabled: eight 8-word constant tables. NOTE(review): these look like an
// earlier form of the fixedW table used by the FXRD loop below - confirm.
/*
volatile const uint fixedWa[8] = {0x428a2f99U,0xd807aa98U,0xf59b89c2U,0xb707775cU,0xad87a3eaU,0xc91b1417U,0xe64fb6a2U,0xe0a1adbeU};
volatile const uint fixedWb[8] = {0xf1374491U,0x12835b01U,0x73924787U,0x0468c23fU,0xbcb1d3a3U,0xc359dce1U,0xe84d923aU,0x7c728e11U};
volatile const uint fixedWc[8] = {0xb5c0fbcfU,0x243185beU,0x23c6886eU,0xe7e72b4cU,0x7b993186U,0xa83253a7U,0xe93a5730U,0x511c78e4U};
volatile const uint fixedWd[8] = {0xe9b5dba5U,0x550c7dc3U,0xa42ca65cU,0x49e1f1a2U,0x562b9420U,0x3b13c12dU,0x09837686U,0x315b45bdU};
volatile const uint fixedWe[8] = {0x3956c25bU,0x72be5d74U,0x15ed3627U,0x4b99c816U,0xbff3ca0cU,0x9d3d725dU,0x078ff753U,0xfca71413U};
volatile const uint fixedWf[8] = {0x59f111f1U,0x80deb1feU,0x4d6edcbfU,0x926d1570U,0xda4b0c23U,0xd9031a84U,0x29833341U,0xea28f96aU};
volatile const uint fixedWg[8] = {0x923f82a4U,0x9bdc06a7U,0xe28217fcU,0xaa0fc072U,0x6cd8711aU,0xb1a03340U,0xd5de0b7eU,0x79703128U};
volatile const uint fixedWh[8] = {0xab1c5ed5U,0xc19bf794U,0xef02488fU,0xadb36e2cU,0x8f337caaU,0x16f58012U,0x6948ccf4U,0x4e1ef848U};
*/




	// Continue the midstate over the block (tmpa, tmpb); result lands in tmpc.
	SHA256(&tmpc, &tmpa, &tmpb);
	// Derive ostate from (tmpc ^ SK02, SK02) and tstate from
	// (tmpc ^ SK03, SK03), both starting from the SHA-256 initial values.
	// Presumably the HMAC outer/inner key pads - confirm SK02/SK03 values.
	tmpb = tmpc^SK02;
	tmpa = SK02;
	SHA256(&ostate, &tmpb, &tmpa);
	tmpb = tmpc^SK03;
	tmpa = SK03;
	SHA256(&tstate, &tmpb, &tmpa);

	//backup tstate
	// This snapshot is restored after scrypt_core() so the post-processing
	// hash restarts from the same state.
	tsb0 = tstate.s0;
	tsb1 = tstate.s1;
	tsb2 = tstate.s2;
	tsb3 = tstate.s3;
	tsb4 = tstate.s4;
	tsb5 = tstate.s5;
	tsb6 = tstate.s6;
	tsb7 = tstate.s7;

	// Load input[0..3] (16 words) as one block and fold it into tstate.
	tmpb.lo = input[0];
	tmpb.hi = input[1];
	tmpc.lo = input[2];
	tmpc.hi = input[3];

	SHA256(&tstate, &tmpb, &tmpc);

	// Clear tmpb once; the loops below only ever rewrite lanes s0 and s7,
	// so lanes s1..s6 stay zero for the rest of the kernel.
	tmpb = zero;
#if (CLSIZE == 64)
	// Produce four 8-word digests, one per value of lnum (1..4), and pack
	// them pairwise into the two uint16 elements of X.
	for(uint i=0; i<4; i++){
		//reset a
		tmpa.s0 = in0;
		tmpa.s1 = in1;
		tmpa.s2 = in2;
		tmpa.s3 = gid;
		tmpa.s4 = lnum++;
		tmpa.s5 = SK00;
		tmpa.s6 = zero;
		tmpa.s7 = zero;
		//set b
		tmpb.s0 = zero;
		tmpb.s7 = SK04;
		//set c
		// Inner hash: start from tstate, absorb (tmpa, tmpb) into tmpc.
		tmpc = tstate;
		SHA256(&tmpc, &tmpa, &tmpb);
		//set a
		// Outer hash: start from ostate, absorb (tmpc, tmpb) into tmpa.
		tmpa = ostate;
		//set b
		tmpb.s0 = SK00;
		tmpb.s7 = SK05;
		SHA256(&tmpa, &tmpc, &tmpb);

		// Even i fills the low half of X[i/2], odd i the high half.
		if(i&one)
			X[i>>1].hi = tmpa;
		else
			X[i>>1].lo = tmpa;
	}
#else
	// Same digest chain as the CLSIZE == 64 branch; each of the four
	// digests is stored directly in its own uint8 element of X.
	for(uint i=0; i<4; i++){
		//reset a
		tmpa.s0 = in0;
		tmpa.s1 = in1;
		tmpa.s2 = in2;
		tmpa.s3 = gid;
		tmpa.s4 = lnum++;
		tmpa.s5 = SK00;
		tmpa.s6 = zero;
		tmpa.s7 = zero;
		//set b
		tmpb.s0 = zero;
		tmpb.s7 = SK04;
		//set c
		// Inner hash: start from tstate, absorb (tmpa, tmpb) into tmpc.
		tmpc = tstate;
		SHA256(&tmpc, &tmpa, &tmpb);
		//set a
		// Outer hash: start from ostate, absorb (tmpc, tmpb) into tmpa.
		tmpa = ostate;
		//set b
		tmpb.s0 = SK00;
		tmpb.s7 = SK05;
		SHA256(&tmpa, &tmpc, &tmpb);
		X[i] = tmpa;
	}
#endif

	// Run the scrypt core over X in place, using padcache as its buffer.
#if (CLSIZE == 64)
	scrypt_core(X, padcache);
#else
	// NOTE(review): shittify/unshittify presumably convert X to and from
	// the word layout scrypt_core expects in this configuration - confirm.
	shittify(X);
	scrypt_core(X, padcache);
	unshittify(X);
#endif

	// Restore tstate to the snapshot taken before input[0..3] was absorbed.
	tstate.s0 = tsb0;
	tstate.s1 = tsb1;
	tstate.s2 = tsb2;
	tstate.s3 = tsb3;
	tstate.s4 = tsb4;
	tstate.s5 = tsb5;
	tstate.s6 = tsb6;
	tstate.s7 = tsb7;

	// Absorb the 32 words of X into tstate as two 16-word blocks.
#if (CLSIZE == 64)
	for(uint i=0; i<2; i++){
		tmpa = X[i].lo;
		tmpc = X[i].hi;
		SHA256(&tstate, &tmpa, &tmpc);
	}
#else
	// X elements are passed by pointer here, two per call.
	for(uint i=0; i<4; i+=2)
		SHA256(&tstate, X+i, X+i+1);
#endif

	// Copy the state into scalars; tstate itself keeps the pre-round values
	// so they can be added back after the FXRD rounds.
	tsb0 = tstate.s0;
	tsb1 = tstate.s1;
	tsb2 = tstate.s2;
	tsb3 = tstate.s3;
	tsb4 = tstate.s4;
	tsb5 = tstate.s5;
	tsb6 = tstate.s6;
	tsb7 = tstate.s7;

	// Temporary aliases so the FXRD argument lists can be rotated by one
	// position per invocation instead of shuffling the variables.
#define A tsb0
#define B tsb1
#define C tsb2
#define D tsb3
#define E tsb4
#define F tsb5
#define G tsb6
#define H tsb7
	// 4 iterations x 16 FXRD invocations = 64 rounds, each taking one lane
	// of fixedW[i]. NOTE(review): presumably a SHA-256 compression over a
	// block whose round inputs are precomputed in fixedW - confirm.
	for(uint i=0; i<4; i++){
		FXRD(A,B,C,D,E,F,G,H, fixedW[i].s0);
		FXRD(H,A,B,C,D,E,F,G, fixedW[i].s1);
		FXRD(G,H,A,B,C,D,E,F, fixedW[i].s2);
		FXRD(F,G,H,A,B,C,D,E, fixedW[i].s3);
		FXRD(E,F,G,H,A,B,C,D, fixedW[i].s4);
		FXRD(D,E,F,G,H,A,B,C, fixedW[i].s5);
		FXRD(C,D,E,F,G,H,A,B, fixedW[i].s6);
		FXRD(B,C,D,E,F,G,H,A, fixedW[i].s7);

        FXRD(A,B,C,D,E,F,G,H, fixedW[i].s8);
        FXRD(H,A,B,C,D,E,F,G, fixedW[i].s9);
        FXRD(G,H,A,B,C,D,E,F, fixedW[i].sa);
        FXRD(F,G,H,A,B,C,D,E, fixedW[i].sb);
        FXRD(E,F,G,H,A,B,C,D, fixedW[i].sc);
        FXRD(D,E,F,G,H,A,B,C, fixedW[i].sd);
        FXRD(C,D,E,F,G,H,A,B, fixedW[i].se);
        FXRD(B,C,D,E,F,G,H,A, fixedW[i].sf);


	}
#undef A
#undef B
#undef C
#undef D
#undef E
#undef F
#undef G
#undef H

	// Add the round outputs onto the pre-round state still held in tstate.
	tstate.s0 += tsb0;
	tstate.s1 += tsb1;
	tstate.s2 += tsb2;
	tstate.s3 += tsb3;
	tstate.s4 += tsb4;
	tstate.s5 += tsb5;
	tstate.s6 += tsb6;
	tstate.s7 += tsb7;

	// Final outer hash. tmpb is deliberately left over from the X-building
	// loop: s0 == SK00, s7 == SK05, all other lanes zero.
	SHA256(&ostate, &tstate, &tmpb);

	// A candidate is accepted when the byte-swapped last state word does
	// not exceed target; SETFOUND is then invoked with this item's gid.
	bool found = (EndianSwapa((ostate.s7)) <= target);
	if(found)
		SETFOUND(gid);
}