#include "globals.h"
#include "sha.h"
#include "transform.h"
#include "scrypt.h"


// search: one work-item hashes one candidate (scalar build) or one candidate
// per vector lane (VECTORS2/3/4/8/16 builds) and reports candidates whose
// final state word passes the target comparison.
//
// Parameters:
//   input      - uint words; indices 0..18 are read by this kernel.
//   output     - result buffer. It is not referenced directly in this body;
//                presumably SETFOUND writes to it - confirm in globals.h.
//   padcache   - scratch buffer handed unchanged to scrypt_core().
//   midstate0  - seeds pad00..pad03 before the first SHA256 call.
//   midstate16 - seeds pad04..pad07 before the first SHA256 call.
//   target     - a candidate is reported when EndianSwapa(ostate07) <= target.
//
// NOTE(review): idx and line0..line7 are used below without a visible
// declaration; presumably DecAllSK (or an included header) declares them,
// together with SK00..SK05 - confirm.
__attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
__kernel void search(const __global uint *restrict input,
volatile __global uint *restrict output, __global CLO *restrict padcache, const uint4 midstate0, const uint4 midstate16, const uint target){

	vo X[32];

	// Every lane starts from the work-item id; lane k is then offset by k so
	// that each lane carries a distinct value. In the scalar build gid is
	// used as-is.
	vo gid = get_global_id(0);
#ifdef VECTORS16
	gid.s1 += 1;
	gid.s2 += 2;
	gid.s3 += 3;
	gid.s4 += 4;
	gid.s5 += 5;
	gid.s6 += 6;
	gid.s7 += 7;
	gid.s8 += 8;
	gid.s9 += 9;
	gid.sa += 10;
	gid.sb += 11;
	gid.sc += 12;
	gid.sd += 13;
	gid.se += 14;
	gid.sf += 15;
#elif defined(VECTORS8)
	gid.s1 += 1;
	gid.s2 += 2;
	gid.s3 += 3;
	gid.s4 += 4;
	gid.s5 += 5;
	gid.s6 += 6;
	gid.s7 += 7;
#elif defined(VECTORS4)
	gid.s1 += 1;
	gid.s2 += 2;
	gid.s3 += 3;
#elif defined(VECTORS3)
	gid.s1 += 1;
	gid.s2 += 2;
#elif defined(VECTORS2)
	gid.s1 += 1;
#endif

	// Per-round additive constants consumed by the RND loop near the end:
	// iteration i uses fixedWa[i], fixedWb[i], ... fixedWh[i] in that order.
	// NOTE(review): the values look like SHA-256 round constants pre-added to
	// the message schedule of a fixed block - confirm against sha.h.
	volatile vo fixedWa[8] = {0x428a2f99,0xd807aa98,0xf59b89c2,0xb707775c,0xad87a3ea,0xc91b1417,0xe64fb6a2,0xe0a1adbe};
	const vo fixedWb[8] = {0xf1374491,0x12835b01,0x73924787,0x0468c23f,0xbcb1d3a3,0xc359dce1,0xe84d923a,0x7c728e11};
	volatile vo fixedWc[8] = {0xb5c0fbcf,0x243185be,0x23c6886e,0xe7e72b4c,0x7b993186,0xa83253a7,0xe93a5730,0x511c78e4};
	const vo fixedWd[8] = {0xe9b5dba5,0x550c7dc3,0xa42ca65c,0x49e1f1a2,0x562b9420,0x3b13c12d,0x09837686,0x315b45bd};
	volatile vo fixedWe[8] = {0x3956c25b,0x72be5d74,0x15ed3627,0x4b99c816,0xbff3ca0c,0x9d3d725d,0x078ff753,0xfca71413};
	const vo fixedWf[8] = {0x59f111f1,0x80deb1fe,0x4d6edcbf,0x926d1570,0xda4b0c23,0xd9031a84,0x29833341,0xea28f96a};
	volatile vo fixedWg[8] = {0x923f82a4,0x9bdc06a7,0xe28217fc,0xaa0fc072,0x6cd8711a,0xb1a03340,0xd5de0b7e,0x79703128};
	const vo fixedWh[8] = {0xab1c5ed5,0xc19bf794,0xef02488f,0xadb36e2c,0x8f337caa,0x16f58012,0x6948ccf4,0x4e1ef848};

	DecAllSK
	// Passed as the last SHA256 argument and reused as zero-valued message
	// words; the meaning of that last argument is defined in sha.h.
	bool zero = 0;
	bool one = 1;
	vo lnum0 = 0; // block counter, incremented once per loop iteration below
	// tstate*/ostate*: the two hash states derived from the SK01 / SK00
	// masked key blocks further down.
	vo tstate00;
	vo tstate01;
	vo tstate02;
	vo tstate03;
	vo tstate04;
	vo tstate05;
	vo tstate06;
	vo tstate07;
	vo ostate00;
	vo ostate01;
	vo ostate02;
	vo ostate03;
	vo ostate04;
	vo ostate05;
	vo ostate06;
	vo ostate07;
	// Copy of tstate taken before the input[0..15] block is absorbed.
	vo tstatebak00;
	vo tstatebak01;
	vo tstatebak02;
	vo tstatebak03;
	vo tstatebak04;
	vo tstatebak05;
	vo tstatebak06;
	vo tstatebak07;
	// Scratch 16-word message block.
	vo tmp00;
	vo tmp01;
	vo tmp02;
	vo tmp03;
	vo tmp04;
	vo tmp05;
	vo tmp06;
	vo tmp07;
	vo tmp08;
	vo tmp09;
	vo tmp10;
	vo tmp11;
	vo tmp12;
	vo tmp13;
	vo tmp14;
	vo tmp15;
	// Starting state supplied by the host.
	vo pad00 = midstate0.x;
	vo pad01 = midstate0.y;
	vo pad02 = midstate0.z;
	vo pad03 = midstate0.w;
	vo pad04 = midstate16.x;
	vo pad05 = midstate16.y;
	vo pad06 = midstate16.z;
	vo pad07 = midstate16.w;
	// Second message block: three words from input, the per-lane gid, SK02,
	// ten zero words and SK03.
	vo data00 = input[16];
	vo data01 = input[17];
	vo data02 = input[18];
	vo data03 = gid;
	vo data04 = SK02;
	vo data05 = 0;
	vo data06 = 0;
	vo data07 = 0;
	vo data08 = 0;
	vo data09 = 0;
	vo data10 = 0;
	vo data11 = 0;
	vo data12 = 0;
	vo data13 = 0;
	vo data14 = 0;
	vo data15 = SK03;

	SHA256(&pad00, &pad01, &pad02, &pad03, &pad04, &pad05, &pad06, &pad07, data00, data01, data02, data03,
		data04, data05, data06, data07, data08, data09, data10, data11, data12, data13, data14, data15, one);

	// Key block masked with SK00 -> ostate.
	// NOTE(review): the pad ^ SK00 / pad ^ SK01 pair looks like HMAC outer and
	// inner key padding - confirm the SK00/SK01 values in globals.h.
	tmp00 = pad00^SK00;
	tmp01 = pad01^SK00;
	tmp02 = pad02^SK00;
	tmp03 = pad03^SK00;
	tmp04 = pad04^SK00;
	tmp05 = pad05^SK00;
	tmp06 = pad06^SK00;
	tmp07 = pad07^SK00;
	tmp08 = SK00;
	tmp09 = SK00;
	tmp10 = SK00;
	tmp11 = SK00;
	tmp12 = SK00;
	tmp13 = SK00;
	tmp14 = SK00;
	tmp15 = SK00;

	SHA256(&ostate00, &ostate01, &ostate02, &ostate03, &ostate04, &ostate05, &ostate06, &ostate07,
		tmp00, tmp01, tmp02, tmp03, tmp04, tmp05, tmp06, tmp07, tmp08, tmp09, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, zero);

	// Key block masked with SK01 -> tstate.
	tmp00 = pad00^SK01;
	tmp01 = pad01^SK01;
	tmp02 = pad02^SK01;
	tmp03 = pad03^SK01;
	tmp04 = pad04^SK01;
	tmp05 = pad05^SK01;
	tmp06 = pad06^SK01;
	tmp07 = pad07^SK01;
	tmp08 = SK01;
	tmp09 = SK01;
	tmp10 = SK01;
	tmp11 = SK01;
	tmp12 = SK01;
	tmp13 = SK01;
	tmp14 = SK01;
	tmp15 = SK01;

	SHA256(&tstate00, &tstate01, &tstate02, &tstate03, &tstate04, &tstate05, &tstate06, &tstate07,
		tmp00, tmp01, tmp02, tmp03, tmp04, tmp05, tmp06, tmp07, tmp08, tmp09, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, zero);

	// Back up tstate: the copy is continued with X[] after scrypt_core.
	tstatebak00 = tstate00;
	tstatebak01 = tstate01;
	tstatebak02 = tstate02;
	tstatebak03 = tstate03;
	tstatebak04 = tstate04;
	tstatebak05 = tstate05;
	tstatebak06 = tstate06;
	tstatebak07 = tstate07;

	// Absorb the first 16 input words into tstate.
	tmp00 = input[0];
	tmp01 = input[1];
	tmp02 = input[2];
	tmp03 = input[3];
	tmp04 = input[4];
	tmp05 = input[5];
	tmp06 = input[6];
	tmp07 = input[7];
	tmp08 = input[8];
	tmp09 = input[9];
	tmp10 = input[10];
	tmp11 = input[11];
	tmp12 = input[12];
	tmp13 = input[13];
	tmp14 = input[14];
	tmp15 = input[15];


	SHA256(&tstate00, &tstate01, &tstate02, &tstate03, &tstate04, &tstate05, &tstate06, &tstate07,
		tmp00, tmp01, tmp02, tmp03, tmp04, tmp05, tmp06, tmp07, tmp08, tmp09, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, one);

	// Fill X[0..31] eight words at a time. Each pass restarts pad from tstate
	// and X[idx..idx+7] from ostate, bumps the counter lnum0, then runs two
	// SHA256 calls: one over data00..data03 + counter, one over the result.
	for(uint i=0; i<4; i++){
		pad00 = tstate00;
		pad01 = tstate01;
		pad02 = tstate02;
		pad03 = tstate03;
		pad04 = tstate04;
		pad05 = tstate05;
		pad06 = tstate06;
		pad07 = tstate07;

		idx = (i<<3);

		X[idx] = ostate00;
		X[idx+1] = ostate01;
		X[idx+2] = ostate02;
		X[idx+3] = ostate03;
		X[idx+4] = ostate04;
		X[idx+5] = ostate05;
		X[idx+6] = ostate06;
		X[idx+7] = ostate07;

		lnum0++;
		SHA256(&pad00, &pad01, &pad02, &pad03, &pad04, &pad05, &pad06, &pad07, data00, data01, data02, data03,
			lnum0, SK02, zero, zero, zero, zero, zero, zero, zero, zero, zero, SK04, one);

		SHA256(&X[idx], &X[idx+1], &X[idx+2], &X[idx+3], &X[idx+4], &X[idx+5], &X[idx+6], &X[idx+7],
			pad00, pad01, pad02, pad03, pad04, pad05, pad06, pad07, SK02, zero, zero, zero, zero, zero, zero, SK05, one);

	}

	// shittify()/Transform() are defined in the included headers and are
	// undone by UnTransform()/unshittify() after scrypt_core.
	// NOTE(review): the gid[i>>k] indexing below suggests Transform() regroups
	// X so that each lane's words are contiguous - confirm in transform.h.
	shittify(X);
#if !defined(VECTORS1)
	Transform(X);
#endif



	// Run scrypt_core once per lane. When the cache-line type CLO and vo have
	// different widths, X is repacked into line0..line7, processed, and then
	// unpacked back into X. When they match, X is passed directly.
	// NOTE(review): gid[n] subscripts a vector value, which relies on compiler
	// support for vector subscripting - confirm on the target toolchain.
	// NOTE(review): VECTORS3 has no branch here and falls through to the
	// scalar call at the bottom with a vector gid - confirm that is intended.
#ifdef VECTORS16

#if (CLSIZE == 64)
	for(uint i=0; i<32; i+=2)
		scrypt_core(&X[i], &X[i+1], padcache, gid[i>>1]);
#elif (CLSIZE == 32)
	for(uint i=0; i<32; i+=2){
		line0 = (CLO)(X[i].s0, X[i].s1, X[i].s2, X[i].s3, X[i].s4, X[i].s5, X[i].s6, X[i].s7);
		line1 = (CLO)(X[i].s8, X[i].s9, X[i].sa, X[i].sb, X[i].sc, X[i].sd, X[i].se, X[i].sf);
		line2 = (CLO)(X[i+1].s0, X[i+1].s1, X[i+1].s2, X[i+1].s3, X[i+1].s4, X[i+1].s5, X[i+1].s6, X[i+1].s7);
		line3 = (CLO)(X[i+1].s8, X[i+1].s9, X[i+1].sa, X[i+1].sb, X[i+1].sc, X[i+1].sd, X[i+1].se, X[i+1].sf);
		scrypt_core(&line0, &line1, &line2, &line3, padcache, gid[i>>1]);
		X[i] = (vo)(line0.s0, line0.s1, line0.s2, line0.s3, line0.s4, line0.s5, line0.s6, line0.s7, line1.s0, line1.s1, line1.s2, line1.s3, line1.s4, line1.s5, line1.s6, line1.s7);
		X[i+1] = (vo)(line2.s0, line2.s1, line2.s2, line2.s3, line2.s4, line2.s5, line2.s6, line2.s7, line3.s0, line3.s1, line3.s2, line3.s3, line3.s4, line3.s5, line3.s6, line3.s7);
	}
#else
	for(uint i=0; i<32; i+=2){
		line0 = (CLO)(X[i].s0, X[i].s1, X[i].s2, X[i].s3);
		line1 = (CLO)(X[i].s4, X[i].s5, X[i].s6, X[i].s7);
		line2 = (CLO)(X[i].s8, X[i].s9, X[i].sa, X[i].sb);
		line3 = (CLO)(X[i].sc, X[i].sd, X[i].se, X[i].sf);
		line4 = (CLO)(X[i+1].s0, X[i+1].s1, X[i+1].s2, X[i+1].s3);
		line5 = (CLO)(X[i+1].s4, X[i+1].s5, X[i+1].s6, X[i+1].s7);
		line6 = (CLO)(X[i+1].s8, X[i+1].s9, X[i+1].sa, X[i+1].sb);
		line7 = (CLO)(X[i+1].sc, X[i+1].sd, X[i+1].se, X[i+1].sf);
		scrypt_core(&line0, &line1, &line2, &line3, &line4, &line5, &line6, &line7, padcache, gid[i>>1]);
		X[i] = (vo)(line0.s0, line0.s1, line0.s2, line0.s3, line1.s0, line1.s1, line1.s2, line1.s3, line2.s0, line2.s1, line2.s2, line2.s3, line3.s0, line3.s1, line3.s2, line3.s3);
		X[i+1] = (vo)(line4.s0, line4.s1, line4.s2, line4.s3, line5.s0, line5.s1, line5.s2, line5.s3, line6.s0, line6.s1, line6.s2, line6.s3, line7.s0, line7.s1, line7.s2, line7.s3);
	}
#endif

#elif defined(VECTORS8)

#if (CLSIZE == 64)

	for(uint i=0; i<32; i+=4){
		line0 = (CLO)(X[i].s0, X[i].s1, X[i].s2, X[i].s3, X[i].s4, X[i].s5, X[i].s6, X[i].s7, X[i+1].s0, X[i+1].s1, X[i+1].s2, X[i+1].s3, X[i+1].s4, X[i+1].s5, X[i+1].s6, X[i+1].s7);
		line1 = (CLO)(X[i+2].s0, X[i+2].s1, X[i+2].s2, X[i+2].s3, X[i+2].s4, X[i+2].s5, X[i+2].s6, X[i+2].s7, X[i+3].s0, X[i+3].s1, X[i+3].s2, X[i+3].s3, X[i+3].s4, X[i+3].s5, X[i+3].s6, X[i+3].s7);
		scrypt_core(&line0, &line1, padcache, gid[i>>2]);
		X[i] = (vo)(line0.s0, line0.s1, line0.s2, line0.s3, line0.s4, line0.s5, line0.s6, line0.s7);
		X[i+1] = (vo)(line0.s8, line0.s9, line0.sa, line0.sb, line0.sc, line0.sd, line0.se, line0.sf);
		X[i+2] = (vo)(line1.s0, line1.s1, line1.s2, line1.s3, line1.s4, line1.s5, line1.s6, line1.s7);
		X[i+3] = (vo)(line1.s8, line1.s9, line1.sa, line1.sb, line1.sc, line1.sd, line1.se, line1.sf);
	}
#elif (CLSIZE == 32)
	for(uint i=0; i<32; i+=4){
		scrypt_core(&X[i], &X[i+1], &X[i+2], &X[i+3], padcache, gid[i>>2]);
	}
#else
	for(uint i=0; i<32; i+=4){
		line0 = (CLO)(X[i].s0, X[i].s1, X[i].s2, X[i].s3);
		line1 = (CLO)(X[i].s4, X[i].s5, X[i].s6, X[i].s7);
		line2 = (CLO)(X[i+1].s0, X[i+1].s1, X[i+1].s2, X[i+1].s3);
		line3 = (CLO)(X[i+1].s4, X[i+1].s5, X[i+1].s6, X[i+1].s7);
		line4 = (CLO)(X[i+2].s0, X[i+2].s1, X[i+2].s2, X[i+2].s3);
		line5 = (CLO)(X[i+2].s4, X[i+2].s5, X[i+2].s6, X[i+2].s7);
		line6 = (CLO)(X[i+3].s0, X[i+3].s1, X[i+3].s2, X[i+3].s3);
		line7 = (CLO)(X[i+3].s4, X[i+3].s5, X[i+3].s6, X[i+3].s7);
		scrypt_core(&line0, &line1, &line2, &line3, &line4, &line5, &line6, &line7, padcache, gid[i>>2]);
		X[i] = (vo)(line0.s0, line0.s1, line0.s2, line0.s3, line1.s0, line1.s1, line1.s2, line1.s3);
		X[i+1] = (vo)(line2.s0, line2.s1, line2.s2, line2.s3, line3.s0, line3.s1, line3.s2, line3.s3);
		X[i+2] = (vo)(line4.s0, line4.s1, line4.s2, line4.s3, line5.s0, line5.s1, line5.s2, line5.s3);
		X[i+3] = (vo)(line6.s0, line6.s1, line6.s2, line6.s3, line7.s0, line7.s1, line7.s2, line7.s3);
	}
#endif

#elif defined(VECTORS4)

#if (CLSIZE == 64)
	for(uint i=0; i<32; i+=8){
		line0 = (CLO)(X[i].s0, X[i].s1, X[i].s2, X[i].s3, X[i+1].s0, X[i+1].s1, X[i+1].s2, X[i+1].s3, X[i+2].s0, X[i+2].s1, X[i+2].s2, X[i+2].s3, X[i+3].s0, X[i+3].s1, X[i+3].s2, X[i+3].s3);
		line1 = (CLO)(X[i+4].s0, X[i+4].s1, X[i+4].s2, X[i+4].s3, X[i+5].s0, X[i+5].s1, X[i+5].s2, X[i+5].s3, X[i+6].s0, X[i+6].s1, X[i+6].s2, X[i+6].s3, X[i+7].s0, X[i+7].s1, X[i+7].s2, X[i+7].s3);
		scrypt_core(&line0, &line1, padcache, gid[i>>3]);
		X[i] = (vo)(line0.s0, line0.s1, line0.s2, line0.s3);
		X[i+1] = (vo)(line0.s4, line0.s5, line0.s6, line0.s7);
		X[i+2] = (vo)(line0.s8, line0.s9, line0.sa, line0.sb);
		X[i+3] = (vo)(line0.sc, line0.sd, line0.se, line0.sf);
		X[i+4] = (vo)(line1.s0, line1.s1, line1.s2, line1.s3);
		X[i+5] = (vo)(line1.s4, line1.s5, line1.s6, line1.s7);
		X[i+6] = (vo)(line1.s8, line1.s9, line1.sa, line1.sb);
		X[i+7] = (vo)(line1.sc, line1.sd, line1.se, line1.sf);
	}

#elif (CLSIZE == 32)
	for(uint i=0; i<32; i+=8){
		line0 = (CLO)(X[i].s0, X[i].s1, X[i].s2, X[i].s3, X[i+1].s0, X[i+1].s1, X[i+1].s2, X[i+1].s3);
		line1 = (CLO)(X[i+2].s0, X[i+2].s1, X[i+2].s2, X[i+2].s3, X[i+3].s0, X[i+3].s1, X[i+3].s2, X[i+3].s3);
		line2 = (CLO)(X[i+4].s0, X[i+4].s1, X[i+4].s2, X[i+4].s3, X[i+5].s0, X[i+5].s1, X[i+5].s2, X[i+5].s3);
		line3 = (CLO)(X[i+6].s0, X[i+6].s1, X[i+6].s2, X[i+6].s3, X[i+7].s0, X[i+7].s1, X[i+7].s2, X[i+7].s3);
		scrypt_core(&line0, &line1, &line2, &line3, padcache, gid[i>>3]);
		X[i] = (vo)(line0.s0, line0.s1, line0.s2, line0.s3);
		X[i+1] = (vo)(line0.s4, line0.s5, line0.s6, line0.s7);
		X[i+2] = (vo)(line1.s0, line1.s1, line1.s2, line1.s3);
		X[i+3] = (vo)(line1.s4, line1.s5, line1.s6, line1.s7);
		X[i+4] = (vo)(line2.s0, line2.s1, line2.s2, line2.s3);
		X[i+5] = (vo)(line2.s4, line2.s5, line2.s6, line2.s7);
		X[i+6] = (vo)(line3.s0, line3.s1, line3.s2, line3.s3);
		X[i+7] = (vo)(line3.s4, line3.s5, line3.s6, line3.s7);
	}
#else
	for(uint i=0; i<32; i+=8){
		scrypt_core(&X[i], &X[i+1], &X[i+2], &X[i+3], &X[i+4], &X[i+5], &X[i+6], &X[i+7], padcache, gid[i>>3]);
	}
#endif



#elif defined(VECTORS2)

#if (CLSIZE == 64)
	for(uint i=0; i<32; i+=16){
		line0 = (CLO)(X[i].s0, X[i].s1, X[i+1].s0, X[i+1].s1, X[i+2].s0, X[i+2].s1, X[i+3].s0, X[i+3].s1, X[i+4].s0, X[i+4].s1, X[i+5].s0, X[i+5].s1, X[i+6].s0, X[i+6].s1, X[i+7].s0, X[i+7].s1);
		line1 = (CLO)(X[i+8].s0, X[i+8].s1, X[i+9].s0, X[i+9].s1, X[i+10].s0, X[i+10].s1, X[i+11].s0, X[i+11].s1, X[i+12].s0, X[i+12].s1, X[i+13].s0, X[i+13].s1, X[i+14].s0, X[i+14].s1, X[i+15].s0, X[i+15].s1);
		scrypt_core(&line0, &line1, padcache, gid[i>>4]);
		X[i] = (vo)(line0.s0, line0.s1);
		X[i+1] = (vo)(line0.s2, line0.s3);
		X[i+2] = (vo)(line0.s4, line0.s5);
		X[i+3] = (vo)(line0.s6, line0.s7);
		X[i+4] = (vo)(line0.s8, line0.s9);
		X[i+5] = (vo)(line0.sa, line0.sb);
		X[i+6] = (vo)(line0.sc, line0.sd);
		X[i+7] = (vo)(line0.se, line0.sf);
		X[i+8] = (vo)(line1.s0, line1.s1);
		X[i+9] = (vo)(line1.s2, line1.s3);
		X[i+10] = (vo)(line1.s4, line1.s5);
		X[i+11] = (vo)(line1.s6, line1.s7);
		X[i+12] = (vo)(line1.s8, line1.s9);
		X[i+13] = (vo)(line1.sa, line1.sb);
		X[i+14] = (vo)(line1.sc, line1.sd);
		X[i+15] = (vo)(line1.se, line1.sf);
	}
#elif (CLSIZE == 32)
	for(uint i=0; i<32; i+=16){
		line0 = (CLO)(X[i].s0, X[i].s1, X[i+1].s0, X[i+1].s1, X[i+2].s0, X[i+2].s1, X[i+3].s0, X[i+3].s1);
		line1 = (CLO)(X[i+4].s0, X[i+4].s1, X[i+5].s0, X[i+5].s1, X[i+6].s0, X[i+6].s1, X[i+7].s0, X[i+7].s1);
		line2 = (CLO)(X[i+8].s0, X[i+8].s1, X[i+9].s0, X[i+9].s1, X[i+10].s0, X[i+10].s1, X[i+11].s0, X[i+11].s1);
		line3 = (CLO)(X[i+12].s0, X[i+12].s1, X[i+13].s0, X[i+13].s1, X[i+14].s0, X[i+14].s1, X[i+15].s0, X[i+15].s1);
		scrypt_core(&line0, &line1, &line2, &line3, padcache, gid[i>>4]);
		X[i] = (vo)(line0.s0, line0.s1);
		X[i+1] = (vo)(line0.s2, line0.s3);
		X[i+2] = (vo)(line0.s4, line0.s5);
		X[i+3] = (vo)(line0.s6, line0.s7);
		X[i+4] = (vo)(line1.s0, line1.s1);
		X[i+5] = (vo)(line1.s2, line1.s3);
		X[i+6] = (vo)(line1.s4, line1.s5);
		X[i+7] = (vo)(line1.s6, line1.s7);
		X[i+8] = (vo)(line2.s0, line2.s1);
		X[i+9] = (vo)(line2.s2, line2.s3);
		X[i+10] = (vo)(line2.s4, line2.s5);
		X[i+11] = (vo)(line2.s6, line2.s7);
		X[i+12] = (vo)(line3.s0, line3.s1);
		X[i+13] = (vo)(line3.s2, line3.s3);
		X[i+14] = (vo)(line3.s4, line3.s5);
		X[i+15] = (vo)(line3.s6, line3.s7);
	}
#else
	for(uint i=0; i<32; i+=16){
		scrypt_core(&X[i], &X[i+1], &X[i+2], &X[i+3], &X[i+4], &X[i+5], &X[i+6], &X[i+7], &X[i+8], &X[i+9], &X[i+10], &X[i+11], &X[i+12], &X[i+13], &X[i+14], &X[i+15], padcache, gid[i>>4]);
	}
#endif

#else
	scrypt_core(X, padcache, gid);
#endif

#if !defined(VECTORS1)
	UnTransform(X);
#endif
	unshittify(X);

	// Absorb the 32 words of X, as two 16-word blocks, into the backed-up
	// tstate.
	idx = 0;
	SHA256(&tstatebak00, &tstatebak01, &tstatebak02, &tstatebak03, &tstatebak04, &tstatebak05, &tstatebak06, &tstatebak07,
		X[idx], X[idx+1], X[idx+2], X[idx+3], X[idx+4], X[idx+5], X[idx+6], X[idx+7],
		X[idx+8], X[idx+9], X[idx+10], X[idx+11], X[idx+12], X[idx+13], X[idx+14], X[idx+15], one);
	idx = 16;
	SHA256(&tstatebak00, &tstatebak01, &tstatebak02, &tstatebak03, &tstatebak04, &tstatebak05, &tstatebak06, &tstatebak07,
		X[idx], X[idx+1], X[idx+2], X[idx+3], X[idx+4], X[idx+5], X[idx+6], X[idx+7],
		X[idx+8], X[idx+9], X[idx+10], X[idx+11], X[idx+12], X[idx+13], X[idx+14], X[idx+15], one);

	// Work on a copy so the pre-round state can be added back afterwards.
	tstate00 = tstatebak00;
	tstate01 = tstatebak01;
	tstate02 = tstatebak02;
	tstate03 = tstatebak03;
	tstate04 = tstatebak04;
	tstate05 = tstatebak05;
	tstate06 = tstatebak06;
	tstate07 = tstatebak07;

	// 8 iterations x 8 RND calls = 64 rounds. The register roles rotate by one
	// position per call, and each call takes its constant from the fixedW
	// tables instead of a message word.
#define A tstate00
#define B tstate01
#define C tstate02
#define D tstate03
#define E tstate04
#define F tstate05
#define G tstate06
#define H tstate07
	for(uint i=0; i<8; i++){
		RND(A,B,C,D,E,F,G,H, fixedWa[i]);
		RND(H,A,B,C,D,E,F,G, fixedWb[i]);
		RND(G,H,A,B,C,D,E,F, fixedWc[i]);
		RND(F,G,H,A,B,C,D,E, fixedWd[i]);
		RND(E,F,G,H,A,B,C,D, fixedWe[i]);
		RND(D,E,F,G,H,A,B,C, fixedWf[i]);
		RND(C,D,E,F,G,H,A,B, fixedWg[i]);
		RND(B,C,D,E,F,G,H,A, fixedWh[i]);
	}
#undef A
#undef B
#undef C
#undef D
#undef E
#undef F
#undef G
#undef H

	// Add the round output back onto the saved state.
	tstatebak00 += tstate00;
	tstatebak01 += tstate01;
	tstatebak02 += tstate02;
	tstatebak03 += tstate03;
	tstatebak04 += tstate04;
	tstatebak05 += tstate05;
	tstatebak06 += tstate06;
	tstatebak07 += tstate07;

	// Final SHA256 call: feed the tstatebak result into ostate.
	SHA256(&ostate00, &ostate01, &ostate02, &ostate03, &ostate04, &ostate05, &ostate06, &ostate07,
		tstatebak00, tstatebak01, tstatebak02, tstatebak03, tstatebak04, tstatebak05, tstatebak06, tstatebak07,
		SK02, zero, zero, zero, zero, zero, zero, SK05, one);


	// Report every lane whose EndianSwapa(ostate07) is <= target. Vector builds
	// first test all lanes at once with any() and only then check per lane.
#ifdef VECTORS16
	one = any(EndianSwapa(ostate07) <= target);
	if(one){
		if(EndianSwapa(ostate07.s0) <= target)
			SETFOUND(gid[0]);
		if(EndianSwapa(ostate07.s1) <= target)
			SETFOUND(gid[1]);
		if(EndianSwapa(ostate07.s2) <= target)
			SETFOUND(gid[2]);
		if(EndianSwapa(ostate07.s3) <= target)
			SETFOUND(gid[3]);
		if(EndianSwapa(ostate07.s4) <= target)
			SETFOUND(gid[4]);
		if(EndianSwapa(ostate07.s5) <= target)
			SETFOUND(gid[5]);
		if(EndianSwapa(ostate07.s6) <= target)
			SETFOUND(gid[6]);
		if(EndianSwapa(ostate07.s7) <= target)
			SETFOUND(gid[7]);
		if(EndianSwapa(ostate07.s8) <= target)
			SETFOUND(gid[8]);
		if(EndianSwapa(ostate07.s9) <= target)
			SETFOUND(gid[9]);
		if(EndianSwapa(ostate07.sa) <= target)
			SETFOUND(gid[10]);
		if(EndianSwapa(ostate07.sb) <= target)
			SETFOUND(gid[11]);
		if(EndianSwapa(ostate07.sc) <= target)
			SETFOUND(gid[12]);
		if(EndianSwapa(ostate07.sd) <= target)
			SETFOUND(gid[13]);
		if(EndianSwapa(ostate07.se) <= target)
			SETFOUND(gid[14]);
		if(EndianSwapa(ostate07.sf) <= target)
			SETFOUND(gid[15]);
	}
#elif defined(VECTORS8)
	one = any(EndianSwapa(ostate07) <= target);
	if(one){
		if(EndianSwapa(ostate07.s0) <= target)
			SETFOUND(gid[0]);
		if(EndianSwapa(ostate07.s1) <= target)
			SETFOUND(gid[1]);
		if(EndianSwapa(ostate07.s2) <= target)
			SETFOUND(gid[2]);
		if(EndianSwapa(ostate07.s3) <= target)
			SETFOUND(gid[3]);
		if(EndianSwapa(ostate07.s4) <= target)
			SETFOUND(gid[4]);
		if(EndianSwapa(ostate07.s5) <= target)
			SETFOUND(gid[5]);
		if(EndianSwapa(ostate07.s6) <= target)
			SETFOUND(gid[6]);
		if(EndianSwapa(ostate07.s7) <= target)
			SETFOUND(gid[7]);
	}
#elif defined(VECTORS4)
	one = any(EndianSwapa(ostate07) <= target);
	if(one){
		if(EndianSwapa(ostate07.s0) <= target)
			SETFOUND(gid[0]);
		if(EndianSwapa(ostate07.s1) <= target)
			SETFOUND(gid[1]);
		if(EndianSwapa(ostate07.s2) <= target)
			SETFOUND(gid[2]);
		if(EndianSwapa(ostate07.s3) <= target)
			SETFOUND(gid[3]);
	}
#elif defined(VECTORS3)
	one = any(EndianSwapa(ostate07) <= target);
	if(one){
		if(EndianSwapa(ostate07.s0) <= target)
			SETFOUND(gid[0]);
		if(EndianSwapa(ostate07.s1) <= target)
			SETFOUND(gid[1]);
		if(EndianSwapa(ostate07.s2) <= target)
			SETFOUND(gid[2]);
	}
#elif defined(VECTORS2)
	one = any(EndianSwapa(ostate07) <= target);
	if(one){
		if(EndianSwapa(ostate07.s0) <= target)
			SETFOUND(gid[0]);
		if(EndianSwapa(ostate07.s1) <= target)
			SETFOUND(gid[1]);
	}
#else
	one = (EndianSwapa(ostate07) <= target);
	if(one)
		SETFOUND(gid);
#endif
}
