/*-
 * Copyright 2009 Colin Percival, 2011 ArtForz, 2011 pooler, 2012 mtrlt,
 * 2012-2013 Con Kolivas.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * This file was originally written by Colin Percival as part of the Tarsnap
 * online backup system.
 *
 * V1.3 modified by sterling pickens linuxsociety.org 2014
 */

#include "globals.cl"
/*
 * Runs 64 RND() rounds over an 8-word state (A..H) using the 16 message
 * words passed in block0..block3, then adds the round output onto the
 * feed-forward values and stores the result through state0 / state1.
 *
 * state0   in/out: words A,B,C,D (x,y,z,w)
 * state1   in/out: words E,F,G,H (x,y,z,w)
 * block0-3 the 16 message words, block0.x first
 * notfresh non-zero: *state0 / *state1 are used as the starting state and as
 *          the feed-forward values.
 *          zero:     the incoming state is ignored; the first four rounds are
 *          replaced by closed-form expressions and the feed-forward values
 *          are taken from Kxx constants.
 *
 * RND, Tr1, Tr2, Ch, Maj, Wr1, Wr2, the DecAllK* declarations and the Kxx
 * constants come from globals.cl and are not visible here; presumably they
 * are the standard SHA-256 round, sigma functions and round constants --
 * confirm there.
 */
void SHA256(uint4 *restrict state0,uint4 *restrict state1, const uint4 block0, const uint4 block1, const uint4 block2, const uint4 block3, bool notfresh){
//void SHA256(uint4 *state0, uint4 *state1, const uint4 block0, const uint4 block1, const uint4 block2, const uint4 block3, bool notfresh){
	uint4 pass0 = *(state0);
	uint4 pass1 = *(state1);


	// tmp* hold the values that are added back after the 64 rounds.
	uint tmp0x = pass0.x;
	uint tmp0y = pass0.y;
	uint tmp0z = pass0.z;
	uint tmp0w = pass0.w;
	uint tmp1x = pass1.x;
	uint tmp1y = pass1.y;
	uint tmp1z = pass1.z;
	uint tmp1w = pass1.w;

	// A..H are the working variables rotated through the RND() calls.
	uint A = tmp0x;
	uint B = tmp0y;
	uint C = tmp0z;
	uint D = tmp0w;
	uint E = tmp1x;
	uint F = tmp1y;
	uint G = tmp1z;
	uint H = tmp1w;

	// The 16-word message window: W = words 0..3, X = 4..7, Y = 8..11, Z = 12..15.
	uint Wx = block0.x;
	uint Wy = block0.y;
	uint Wz = block0.z;
	uint Ww = block0.w;

	uint Xx = block1.x;
	uint Xy = block1.y;
	uint Xz = block1.z;
	uint Xw = block1.w;

	uint Yx = block2.x;
	uint Yy = block2.y;
	uint Yz = block2.z;
	uint Yw = block2.w;

	uint Zx = block3.x;
	uint Zy = block3.y;
	uint Zz = block3.z;
	uint Zw = block3.w;

	// Macros from globals.cl; presumably they declare the Kxx constants used below.
	DecAllKA
	DecAllKE
    DecAllKB
    DecAllKC
    DecAllKD

	if(notfresh){
		// Rounds 0..3 starting from the caller-supplied state.
		RND(A,B,C,D,E,F,G,H,K00+Wx)
		RND(H,A,B,C,D,E,F,G,K01+Wy)
		RND(G,H,A,B,C,D,E,F,K02+Wz)
		RND(F,G,H,A,B,C,D,E,K03+Ww)
	}else{
		// Rounds 0..3 with the starting state folded into constants
		// (K63..K75); only the message words Wx..Ww remain variable.
		D= K63 +Wx;
		H= K64 +Wx;
		C= K65 +Tr1(D)+Ch(D, K66, K67)+Wy;
		G= K68 +C+Tr2(H)+Ch(H, K69 ,K70);
		tmp1x = K66;
		tmp1y = K67;
		B= K71 +Tr1(C)+Ch(C,D,K66)+Wz;
		F= K72 +B+Tr2(G)+Maj(G,H,K73);
		A= K74 +Tr1(B)+Ch(B,C,D)+Ww;
		E= K75 +A+Tr2(F)+Maj(F,G,H);
		// Feed-forward values are replaced by constants as well
		// (presumably the standard initial hash value -- confirm in globals.cl).
		tmp0x = K73;
		tmp0y = K77;
		tmp0z = K78;
		tmp0w = K79;
		tmp1z = K80;
		tmp1w = K81;
	}

	// Rounds 4..15: remaining message words used unmodified.
	RND(E,F,G,H,A,B,C,D,K04+Xx)
	RND(D,E,F,G,H,A,B,C,K05+Xy)
	RND(C,D,E,F,G,H,A,B,K06+Xz)
	RND(B,C,D,E,F,G,H,A,K07+Xw)

	RND(A,B,C,D,E,F,G,H,K08+Yx)
	RND(H,A,B,C,D,E,F,G,K09+Yy)
	RND(G,H,A,B,C,D,E,F,K10+Yz)
	RND(F,G,H,A,B,C,D,E,K11+Yw)

	RND(E,F,G,H,A,B,C,D,K12+Zx)
	RND(D,E,F,G,H,A,B,C,K13+Zy)
	RND(C,D,E,F,G,H,A,B,K14+Zz)
	// Note: round 15 uses K76 and round 16 uses K15, so the Kxx suffix is a
	// slot name from globals.cl rather than the round number.
	RND(B,C,D,E,F,G,H,A,K76+Zw)

	// Rounds 16..63: each message word is extended in place from the
	// 16-word window (Wr1/Wr2 terms) and consumed by the following RND().
	Wx += Wr1(Zz) + Yy + Wr2(Wy);
	RND(A,B,C,D,E,F,G,H, Wx+ K15)
	Wy += Wr1(Zw) + Yz + Wr2(Wz);
	RND(H,A,B,C,D,E,F,G, Wy+ K16)
	Wz += Wr1(Wx) + Yw + Wr2(Ww);
	RND(G,H,A,B,C,D,E,F, Wz+ K17)
	Ww += Wr1(Wy) + Zx + Wr2(Xx);
	RND(F,G,H,A,B,C,D,E, Ww+ K18)

	Xx += Wr1(Wz) + Zy + Wr2(Xy);
	RND(E,F,G,H,A,B,C,D, Xx+ K19)
	Xy += Wr1(Ww) + Zz + Wr2(Xz);
	RND(D,E,F,G,H,A,B,C, Xy+ K20)
	Xz += Wr1(Xx) + Zw + Wr2(Xw);
	RND(C,D,E,F,G,H,A,B, Xz+ K21)
	Xw += Wr1(Xy) + Wx + Wr2(Yx);
	RND(B,C,D,E,F,G,H,A, Xw+ K22)

	Yx += Wr1(Xz) + Wy + Wr2(Yy);
	RND(A,B,C,D,E,F,G,H, Yx+ K23)
	Yy += Wr1(Xw) + Wz + Wr2(Yz);
	RND(H,A,B,C,D,E,F,G, Yy+ K24)
	Yz += Wr1(Yx) + Ww + Wr2(Yw);
	RND(G,H,A,B,C,D,E,F, Yz+ K25)
	Yw += Wr1(Yy) + Xx + Wr2(Zx);
	RND(F,G,H,A,B,C,D,E, Yw+ K26)

	Zx += Wr1(Yz) + Xy + Wr2(Zy);
	RND(E,F,G,H,A,B,C,D, Zx+ K27)
	Zy += Wr1(Yw) + Xz + Wr2(Zz);
	RND(D,E,F,G,H,A,B,C, Zy+ K28)
	Zz += Wr1(Zx) + Xw + Wr2(Zw);
	RND(C,D,E,F,G,H,A,B, Zz+ K29)
	Zw += Wr1(Zy) + Yx + Wr2(Wx);
	RND(B,C,D,E,F,G,H,A, Zw+ K30)

	Wx += Wr1(Zz) + Yy + Wr2(Wy);
	RND(A,B,C,D,E,F,G,H, Wx+ K31)
	Wy += Wr1(Zw) + Yz + Wr2(Wz);
	RND(H,A,B,C,D,E,F,G, Wy+ K32)
	Wz += Wr1(Wx) + Yw + Wr2(Ww);
	RND(G,H,A,B,C,D,E,F, Wz+ K33)
	Ww += Wr1(Wy) + Zx + Wr2(Xx);
	RND(F,G,H,A,B,C,D,E, Ww+ K34)

	Xx += Wr1(Wz) + Zy + Wr2(Xy);
	RND(E,F,G,H,A,B,C,D, Xx+ K35)
	Xy += Wr1(Ww) + Zz + Wr2(Xz);
	RND(D,E,F,G,H,A,B,C, Xy+ K36)
	Xz += Wr1(Xx) + Zw + Wr2(Xw);
	RND(C,D,E,F,G,H,A,B, Xz+ K37)
	Xw += Wr1(Xy) + Wx + Wr2(Yx);
	RND(B,C,D,E,F,G,H,A, Xw+ K38)

	Yx += Wr1(Xz) + Wy + Wr2(Yy);
	RND(A,B,C,D,E,F,G,H, Yx+ K39)
	Yy += Wr1(Xw) + Wz + Wr2(Yz);
	RND(H,A,B,C,D,E,F,G, Yy+ K40)
	Yz += Wr1(Yx) + Ww + Wr2(Yw);
	RND(G,H,A,B,C,D,E,F, Yz+ K41)
	Yw += Wr1(Yy) + Xx + Wr2(Zx);
	RND(F,G,H,A,B,C,D,E, Yw+ K42)

	Zx += Wr1(Yz) + Xy + Wr2(Zy);
	RND(E,F,G,H,A,B,C,D, Zx+ K43)
	Zy += Wr1(Yw) + Xz + Wr2(Zz);
	RND(D,E,F,G,H,A,B,C, Zy+ K44)
	Zz += Wr1(Zx) + Xw + Wr2(Zw);
	RND(C,D,E,F,G,H,A,B, Zz+ K45)
	Zw += Wr1(Zy) + Yx + Wr2(Wx);
	RND(B,C,D,E,F,G,H,A, Zw+ K46)

	Wx += Wr1(Zz) + Yy + Wr2(Wy);
	RND(A,B,C,D,E,F,G,H, Wx+ K47)
	Wy += Wr1(Zw) + Yz + Wr2(Wz);
	RND(H,A,B,C,D,E,F,G, Wy+ K48)
	Wz += Wr1(Wx) + Yw + Wr2(Ww);
	RND(G,H,A,B,C,D,E,F, Wz+ K49)
	Ww += Wr1(Wy) + Zx + Wr2(Xx);
	RND(F,G,H,A,B,C,D,E, Ww+ K50)

	Xx += Wr1(Wz) + Zy + Wr2(Xy);
	RND(E,F,G,H,A,B,C,D, Xx+ K51)
	Xy += Wr1(Ww) + Zz + Wr2(Xz);
	RND(D,E,F,G,H,A,B,C, Xy+ K52)
	Xz += Wr1(Xx) + Zw + Wr2(Xw);
	RND(C,D,E,F,G,H,A,B, Xz+ K53)
	Xw += Wr1(Xy) + Wx + Wr2(Yx);
	RND(B,C,D,E,F,G,H,A, Xw+ K54)

	Yx += Wr1(Xz) + Wy + Wr2(Yy);
	RND(A,B,C,D,E,F,G,H, Yx+ K55)
	Yy += Wr1(Xw) + Wz + Wr2(Yz);
	RND(H,A,B,C,D,E,F,G, Yy+ K56)
	Yz += Wr1(Yx) + Ww + Wr2(Yw);
	RND(G,H,A,B,C,D,E,F, Yz+ K57)
	Yw += Wr1(Yy) + Xx + Wr2(Zx);
	RND(F,G,H,A,B,C,D,E, Yw+ K58)

	Zx += Wr1(Yz) + Xy + Wr2(Zy);
	RND(E,F,G,H,A,B,C,D, Zx+ K59)
	Zy += Wr1(Yw) + Xz + Wr2(Zz);
	RND(D,E,F,G,H,A,B,C, Zy+ K60)
	Zz += Wr1(Zx) + Xw + Wr2(Zw);
	RND(C,D,E,F,G,H,A,B, Zz+ K61)
	Zw += Wr1(Zy) + Yx + Wr2(Wx);
	RND(B,C,D,E,F,G,H,A, Zw+ K62)


	// Feed-forward: add the round output onto the saved starting values.
	tmp0x += A;
	tmp0y += B;
	tmp0z += C;
	tmp0w += D;
	tmp1x += E;
	tmp1y += F;
	tmp1z += G;
	tmp1w += H;

	//FNIadd(tmp0, S0)
	//FNIadd(tmp1, S1)
	//FNIcpyca(state0, tmp0)
	//FNIcpyca(state1, tmp1)

	// Repack the scalars into vectors and publish the new state to the caller.
	pass0.x = tmp0x;
	pass0.y = tmp0y;
	pass0.z = tmp0z;
	pass0.w = tmp0w;

	pass1.x = tmp1x;
	pass1.y = tmp1y;
	pass1.z = tmp1z;
	pass1.w = tmp1w;


	*state0 = pass0;
	*state1 = pass1;

}


/*
 * Mixes the four uint4 lanes of w in place.
 *
 * Each of the four iterations performs two groups of four XOR/rotate/add
 * steps with rotation amounts 7, 9, 13 and 18: the first group combines the
 * lanes component-for-component, the second combines swizzled (.wxyz/.zwxy)
 * copies.  Statement order matters because every step reads lanes written by
 * the steps before it.  (The rotation amounts match the Salsa20 quarter
 * round; rotl itself is defined outside this file.)
 */
void halfsalsa(uint4 *w){
	for(uint iter=0; iter!=4; ++iter){
		// Straight group: operands used as stored.
		w[0] = w[0] ^ rotl(w[3] + w[2],  7U);
		w[1] = w[1] ^ rotl(w[0] + w[3],  9U);
		w[2] = w[2] ^ rotl(w[1] + w[0], 13U);
		w[3] = w[3] ^ rotl(w[2] + w[1], 18U);

		// Swizzled group: operands are component-rotated before the add.
		w[2] = w[2] ^ rotl(w[3].wxyz + w[0].zwxy,  7U);
		w[1] = w[1] ^ rotl(w[2].wxyz + w[3].zwxy,  9U);
		w[0] = w[0] ^ rotl(w[1].wxyz + w[2].zwxy, 13U);
		w[3] = w[3] ^ rotl(w[0].wxyz + w[1].zwxy, 18U);
	}
}

/*
 * Mixes the 8 x uint4 block B in place, treating B[0..3] and B[4..7] as two
 * halves that are alternately XORed together and run through halfsalsa().
 *
 * When LOOKUP_GAP == 2 the function takes an extra flag: db != 0 performs the
 * XOR/halfsalsa pair sequence twice in a single call before the final add,
 * db == 0 performs it once.  For every other LOOKUP_GAP the flag does not
 * exist and the sequence always runs once.
 */
#if (LOOKUP_GAP == 2)
void salsa(uint4 *B, bool db){
#else
void salsa(uint4 *B){
#endif
	uint4 mix[4];
	uint k;

	// Fold the upper half into the lower half and mix the result.
	for(k=0; k<4; ++k){
		B[k] ^= B[k+4];
		mix[k] = B[k];
	}
	halfsalsa(mix);

	// Add the mix output to the lower half, fold into the upper half, mix.
	for(k=0; k<4; ++k){
		B[k] += mix[k];
		B[k+4] ^= B[k];
		mix[k] = B[k+4];
	}
	halfsalsa(mix);

#if (LOOKUP_GAP == 2)
	if(db){
		// Second pass, fused in: finish the upper half, then repeat both steps.
		for(k=0; k<4; ++k){
			B[k+4] += mix[k];
			B[k] ^= B[k+4];
			mix[k] = B[k];
		}
		halfsalsa(mix);

		for(k=0; k<4; ++k){
			B[k] += mix[k];
			B[k+4] ^= B[k];
			mix[k] = B[k+4];
		}
		halfsalsa(mix);
	}
#endif

	// Final feed-forward into the upper half.
	for(k=0; k<4; ++k){
		B[k+4] += mix[k];
	}
}

/*
 * Memory-hard mixing stage operating on the 8 x uint4 state X, with `lookup`
 * as the per-thread scratchpad.
 *
 *  1. X is permuted and byte-swapped into the layout salsa() works on.
 *  2. Fill pass: every LOOKUP_GAP-th intermediate state is stored to `lookup`
 *     while X is advanced nfact times in total.
 *  3. Read pass: nfact times, an index j is taken from X[7].x, entry j is
 *     XORed into X and X is mixed once.  Entries that were not stored are
 *     rebuilt from the nearest stored one by replaying j % LOOKUP_GAP mixes.
 *  4. The permutation from step 1 is undone.
 *
 * X       in/out working state, 8 x uint4
 * lookup  scratchpad indexed through the IDX macro (defined outside this
 *         file; presumably built from x, y, z and the *SIZE constants below)
 *
 * nfact is declared by DEFNFACTOR (globals.cl).  The index mask (nfact-1)
 * and the omission of the tail fill for LOOKUP_GAP 1/2/4/8 both assume nfact
 * is a power of two that those gaps divide -- confirm in globals.cl.
 */
void scrypt_core(uint4 *X, __global uint4 *restrict lookup){
	DEFNFACTOR(nfact)
	const uint zSIZE = 8;
	const uint ySIZE = (nfact/LOOKUP_GAP+(nfact%LOOKUP_GAP>0));
	const uint xSIZE = CONCURRENT_THREADS;
	uint x = get_global_id(0)%xSIZE;

	// Step 1: gather components across the vectors of each half and byte-swap.
	uint4 tmpa = (uint4)(X[1].x,X[2].y,X[3].z,X[0].w);
	uint4 tmpb = (uint4)(X[2].x,X[3].y,X[0].z,X[1].w);
	uint4 tmpc = (uint4)(X[3].x,X[0].y,X[1].z,X[2].w);
	uint4 tmpd = (uint4)(X[0].x,X[1].y,X[2].z,X[3].w);

	X[0] = EndianSwapa(tmpa);
	X[1] = EndianSwapb(tmpb);
	X[2] = EndianSwapb(tmpc);
	X[3] = EndianSwapb(tmpd);

	tmpa = (uint4)(X[5].x,X[6].y,X[7].z,X[4].w);
	tmpb = (uint4)(X[6].x,X[7].y,X[4].z,X[5].w);
	tmpc = (uint4)(X[7].x,X[4].y,X[5].z,X[6].w);
	tmpd = (uint4)(X[4].x,X[5].y,X[6].z,X[7].w);

	X[4] = EndianSwapa(tmpa);
	X[5] = EndianSwapb(tmpb);
	X[6] = EndianSwapb(tmpc);
	X[7] = EndianSwapb(tmpd);

	// Step 2: fill pass -- store one state per LOOKUP_GAP mixes.
	for(uint y=0; y<(nfact/LOOKUP_GAP); ++y){
		for(uint z=0; z<zSIZE; ++z)
			lookup[IDX] = X[z];

#if (LOOKUP_GAP == 2)
		salsa(X, 1);	// db=1: two mixes in one call
#elif (LOOKUP_GAP == 1)
		salsa(X);
#else
		for(uint i=0; i<LOOKUP_GAP; ++i)
			salsa(X);
#endif
	}
#if (LOOKUP_GAP != 1) && (LOOKUP_GAP != 2) && (LOOKUP_GAP != 4) && (LOOKUP_GAP != 8)
	{
		// Tail of the fill pass when LOOKUP_GAP does not divide nfact.
		uint y = (nfact/LOOKUP_GAP);
		for(uint z=0; z<zSIZE; ++z)
			lookup[IDX] = X[z];
		for(uint i=0; i<(nfact%LOOKUP_GAP); ++i)
			salsa(X);
	}
#endif

	// Step 3: read pass.
	for (uint i=0; i<nfact; ++i){
		uint j = X[7].x & (nfact-1);

		// y selects the stored entry at or below j.
#if (LOOKUP_GAP == 1)
		uint y = j;
#elif (LOOKUP_GAP == 2)
		uint y = (j>>1);
#elif (LOOKUP_GAP == 4)
		uint y = (j>>2);
#elif (LOOKUP_GAP == 8)
		uint y = (j>>3);
#else
		uint y = (j/LOOKUP_GAP);
#endif

#if (LOOKUP_GAP == 1)
		// Every entry is stored: XOR it in directly.
		for(uint z=0; z<zSIZE; ++z)
			X[z] ^= lookup[IDX];
		salsa(X);
#elif (LOOKUP_GAP == 2)
		if(j&1){
			// Odd entries were skipped: rebuild with one mix first.
			uint4 V[8];
			for(uint z=0; z<zSIZE; ++z)
				V[z] = lookup[IDX];
			salsa(V, 0);
			for(uint z=0; z<zSIZE; ++z)
				X[z] ^= V[z];
		}else{
			for(uint z=0; z<zSIZE; ++z)
				X[z] ^= lookup[IDX];
		}
		salsa(X, 0);
#else
		uint4 V[8];
		for(uint z=0; z<zSIZE; ++z)
			V[z] = lookup[IDX];

		// Rebuild entry j from the stored one BEFORE it is mixed into X;
		// XORing first would use the wrong entry and waste the replay.
		const uint val = j%LOOKUP_GAP;
		for (uint k=0; k<val; ++k)
			salsa(V);

		for(uint z=0; z<zSIZE; ++z)
			X[z] ^= V[z];

		// X itself must advance once per iteration, as in the other branches.
		salsa(X);
#endif
	}

	// Step 4: undo the permutation applied in step 1.
	tmpa = (uint4)(X[3].x,X[2].y,X[1].z,X[0].w);
	tmpb = (uint4)(X[0].x,X[3].y,X[2].z,X[1].w);
	tmpc = (uint4)(X[1].x,X[0].y,X[3].z,X[2].w);
	tmpd = (uint4)(X[2].x,X[1].y,X[0].z,X[3].w);

	X[0] = EndianSwapa(tmpa);
	X[1] = EndianSwapb(tmpb);
	X[2] = EndianSwapb(tmpc);
	X[3] = EndianSwapb(tmpd);

	tmpa = (uint4)(X[7].x,X[6].y,X[5].z,X[4].w);
	tmpb = (uint4)(X[4].x,X[7].y,X[6].z,X[5].w);
	tmpc = (uint4)(X[5].x,X[4].y,X[7].z,X[6].w);
	tmpd = (uint4)(X[6].x,X[5].y,X[4].z,X[7].w);

	X[4] = EndianSwapa(tmpa);
	X[5] = EndianSwapb(tmpb);
	X[6] = EndianSwapb(tmpc);
	X[7] = EndianSwapb(tmpd);
}



/*
 * Per-round addends for the 64-round loop near the end of search().
 * Iteration i of that loop issues eight RND() calls that consume
 * fixedWa[i], fixedWb[i], ... fixedWh[i] in that order, so round number
 * r = 8*i + (table letter index, a=0 .. h=7).
 * NOTE(review): the values look like SHA-256 round constants with a fixed
 * message schedule already added in (e.g. fixedWa[0] is 0x428a2f98 + 1);
 * confirm against the block that search() intends to hash here.
 */
__constant uint fixedWa[8] = {0x428a2f99,0xd807aa98,0xf59b89c2,0xb707775c,0xad87a3ea,0xc91b1417,0xe64fb6a2,0xe0a1adbe};
__constant uint fixedWb[8] = {0xf1374491,0x12835b01,0x73924787,0x0468c23f,0xbcb1d3a3,0xc359dce1,0xe84d923a,0x7c728e11};
__constant uint fixedWc[8] = {0xb5c0fbcf,0x243185be,0x23c6886e,0xe7e72b4c,0x7b993186,0xa83253a7,0xe93a5730,0x511c78e4};
__constant uint fixedWd[8] = {0xe9b5dba5,0x550c7dc3,0xa42ca65c,0x49e1f1a2,0x562b9420,0x3b13c12d,0x09837686,0x315b45bd};
__constant uint fixedWe[8] = {0x3956c25b,0x72be5d74,0x15ed3627,0x4b99c816,0xbff3ca0c,0x9d3d725d,0x078ff753,0xfca71413};
__constant uint fixedWf[8] = {0x59f111f1,0x80deb1fe,0x4d6edcbf,0x926d1570,0xda4b0c23,0xd9031a84,0x29833341,0xea28f96a};
__constant uint fixedWg[8] = {0x923f82a4,0x9bdc06a7,0xe28217fc,0xaa0fc072,0x6cd8711a,0xb1a03340,0xd5de0b7e,0x79703128};
__constant uint fixedWh[8] = {0xab1c5ed5,0xc19bf794,0xef02488f,0xadb36e2c,0x8f337caa,0x16f58012,0x6948ccf4,0x4e1ef848};


/*
 * Small constant table.  No live code in this file indexes sK directly; it
 * only appears in commented-out SHA256() calls inside search() (sK[2], sK[4],
 * sK[5]).  Presumably the DecAllSK macro in globals.cl builds the SKxx values
 * from it -- confirm there.
 * NOTE(review): the values look like the HMAC outer/inner pad bytes
 * (0x5C.., 0x36..), the SHA-256 padding bit (0x80000000) and three message
 * bit lengths (0x280, 0x4a0, 0x300); verify against their use in globals.cl.
 */
__constant uint sK[6] = {
    0x5C5C5C5CU, //82
    0x36363636U,
    0x80000000U,
//  0x000003FFU, //never used
    0x00000280U,
    0x000004a0U,
    0x00000300U
};




/*
 * Kernel entry point: one work item evaluates one candidate nonce.
 *
 * The work item's global id becomes the last word of the final data chunk
 * (dataw = gid); the other three words come from input[4].  The chain of
 * SHA256() calls around scrypt_core() looks like an HMAC-SHA256 / PBKDF2
 * construction (SK03 and SK04 are XORed into the first hash result the way
 * HMAC pads are) -- the SKxx values come from DecAllSK in globals.cl, so
 * confirm there.
 *
 * input                 at least five uint4; input[0..3] are hashed as one
 *                       block, input[4].xyz supply three further words
 * output                not referenced by name here; presumably written by
 *                       the SETFOUND macro
 * padcache              scratchpad handed to scrypt_core()
 * midstate0/midstate16  state the first SHA256() call continues from
 * target                the nonce is reported when
 *                       EndianSwapa(last state word) <= target
 */
__attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
__kernel void search(__global const uint4 *restrict input,
volatile __global uint *restrict output, __global uint4 *restrict padcache, const uint4 midstate0, const uint4 midstate16, const uint target){
	uint gid = get_global_id(0);
	uint4 X[8];
	uint4 tmp0 = 0;
	uint4 tmp1 = 0;
	uint4 pass0 = 0;
	uint4 pass1 = 0;
	uint4 pass2 = 0;
	// tstate*/ostate* start at zero; they are only passed to SHA256() with
	// notfresh == 0, which ignores the incoming state.
	uint tstate0x = 0;
	uint tstate0y = 0;
	uint tstate0z = 0;
	uint tstate0w = 0;
	uint tstate1x = 0;
	uint tstate1y = 0;
	uint tstate1z = 0;
	uint tstate1w = 0;
	uint ostate0x = 0;
	uint ostate0y = 0;
	uint ostate0z = 0;
	uint ostate0w = 0;
	uint ostate1x = 0;
	uint ostate1y = 0;
	uint ostate1z = 0;
	uint ostate1w = 0;
	uint datax = input[4].x;
	uint datay = input[4].y;
	uint dataz = input[4].z;
	uint dataw = gid;	// the nonce under test
	uint pad0x = midstate0.x;
	uint pad0y = midstate0.y;
	uint pad0z = midstate0.z;
	uint pad0w = midstate0.w;
	uint pad1x = midstate16.x;
	uint pad1y = midstate16.y;
	uint pad1z = midstate16.z;
	uint pad1w = midstate16.w;
	// Macro from globals.cl; presumably declares SK00..SK07 used below.
	DecAllSK

	pass0.x = pad0x;
	pass0.y = pad0y;
	pass0.z = pad0z;
	pass0.w = pad0w;

	pass1.x = pad1x;
	pass1.y = pad1y;
	pass1.z = pad1z;
	pass1.w = pad1w;

	pass2.x = datax;
	pass2.y = datay;
	pass2.z = dataz;
	pass2.w = dataw;

	// Continue from the supplied midstate over the chunk that carries the nonce.
	SHA256(&pass0,&pass1, pass2, SK00, SK01, SK02, 1);

	// Keep the resulting 8 words in pad*; they are reused for both keyed states.
	pad0x = pass0.x;
	pad0y = pass0.y;
	pad0z = pass0.z;
	pad0w = pass0.w;

	pad1x = pass1.x;
	pad1y = pass1.y;
	pad1z = pass1.z;
	pad1w = pass1.w;

	// ostate: fresh hash of (pad ^ SK03) followed by eight words of SK03.
	tmp0 = pass0^SK03;
	tmp1 = pass1^SK03;

	pass0.x = ostate0x;
	pass0.y = ostate0y;
	pass0.z = ostate0z;
	pass0.w = ostate0w;

	pass1.x = ostate1x;
	pass1.y = ostate1y;
	pass1.z = ostate1z;
	pass1.w = ostate1w;

	SHA256(&pass0, &pass1, tmp0, tmp1, SK03, SK03, 0);

	ostate0x = pass0.x;
	ostate0y = pass0.y;
	ostate0z = pass0.z;
	ostate0w = pass0.w;

	ostate1x = pass1.x;
	ostate1y = pass1.y;
	ostate1z = pass1.z;
	ostate1w = pass1.w;

	// tstate: same construction with SK04 instead of SK03.
	pass0.x = pad0x;
	pass0.y = pad0y;
	pass0.z = pad0z;
	pass0.w = pad0w;

	pass1.x = pad1x;
	pass1.y = pad1y;
	pass1.z = pad1z;
	pass1.w = pad1w;

	tmp0 = pass0^SK04;
	tmp1 = pass1^SK04;

	pass0.x = tstate0x;
	pass0.y = tstate0y;
	pass0.z = tstate0z;
	pass0.w = tstate0w;

	pass1.x = tstate1x;
	pass1.y = tstate1y;
	pass1.z = tstate1z;
	pass1.w = tstate1w;

	SHA256(&pass0, &pass1, tmp0, tmp1, SK04, SK04, 0);

	tstate0x = pass0.x;
	tstate0y = pass0.y;
	tstate0z = pass0.z;
	tstate0w = pass0.w;
	tstate1x = pass1.x;
	tstate1y = pass1.y;
	tstate1z = pass1.z;
	tstate1w = pass1.w;

	// tmp0/tmp1 keep the SK04-keyed state; it is the starting point for the
	// hashing that follows scrypt_core().
	tmp0 = pass0;
	tmp1 = pass1;

	// Advance the SK04-keyed state over the first four input vectors.
	SHA256(&pass0, &pass1, input[0],input[1],input[2],input[3], 1);

	
	//FOURfromvec(tstate0, pass0);
	//FOURfromvec(tstate1, pass1);
	tstate0x = pass0.x;
	tstate0y = pass0.y;
	tstate0z = pass0.z;
	tstate0w = pass0.w;
	tstate1x = pass1.x;
	tstate1y = pass1.y;
	tstate1z = pass1.z;
	tstate1w = pass1.w;

        pass2.x = datax;
        pass2.y = datay;
        pass2.z = dataz;
        pass2.w = dataw;



	// Produce the eight uint4 of X, two per iteration.
	// NOTE(review): nothing visible here makes the iterations differ -- the
	// commented-out variant below passes i+1 in the second block and
	// `SK05.x++` is disabled.  Unless SK05 (from DecAllSK) depends on i,
	// all four X pairs come out identical; confirm against globals.cl.
	for (uint i=0; i<4; i++){
		//FOURcopy(pad0, tstate0);
		//FOURcopy(pad1, tstate1);
		//pad0 = tstate0;
		//pad0x = tstate0x;
		//pad0y = tstate0y;
		//pad0z = tstate0z;
		//pad0w = tstate0w;		
		//pad1x = tstate1x;
		//pad1y = tstate1y;
		//pad1z = tstate1z;
		//pad1w = tstate1w;

		// Seed this X pair with the SK03-keyed state.
		pass0.x = ostate0x;
		pass0.y = ostate0y;
		pass0.z = ostate0z;
		pass0.w = ostate0w;
		pass1.x = ostate1x;
		pass1.y = ostate1y;
		pass1.z = ostate1z;
		pass1.w = ostate1w;

		//FOURtovec(pass0, ostate0);
		//FOURtovec(pass1, ostate1);
		X[(i<<1) ] = pass0;
		X[(i<<1)+1] = pass1;

		// Restart from the SK04-keyed state that already covers input[0..3].
		pass0.x = tstate0x;
		pass0.y = tstate0y;
		pass0.z = tstate0z;
		pass0.w = tstate0w;
		pass1.x = tstate1x;
		pass1.y = tstate1y;
		pass1.z = tstate1z;
		pass1.w = tstate1w;
		//pass2.x = datax;
		//pass2.y = datay;
		//pass2.z = dataz;
		//pass2.w = datax;
		//FOURtovec(pass0, pad0);
		//FOURtovec(pass1, pad1);
		//FOURtovec(pass2, data);
		//SK05.x++;

		//tstate data
		SHA256(&pass0, &pass1, pass2, SK05, SK01, SK06, 1);
		//SHA256(&pass0, &pass1, pass2,(uint4)(i+1,sK[2],0,0), (uint4)(0,0,0,0), (uint4)(0,0,0, sK[4]), 1);
		//pad0x = pass0.x;
		//pad0y = pass0.y;
		//pad0z = pass0.z;
		//pad0w = pass0.w;
		//pad1x = pass1.x;
		//pad1y = pass1.y;
		//pad1z = pass1.z;
		//pad1w = pass1.w;
		//FOURfromvec(pad0, pass0);
		//FOURfromvec(pad1, pass1);
		//SHA256(X+(i<<1),X+(i<<1)+1,pass0, pass1,(uint4)(sK[2], 0U, 0U, 0U), (uint4)(0U, 0U, 0U, sK[5]), 1);
		// Hash the 8-word result under the SK03-keyed state held in the X pair.
		SHA256(X+(i<<1),X+(i<<1)+1, pass0, pass1, SK00, SK07, 1);
	}
	scrypt_core(X,padcache);

	// Absorb the mixed X (two blocks) into the saved SK04-keyed state.
	SHA256(&tmp0,&tmp1, X[0], X[1], X[2], X[3], 1);
	SHA256(&tmp0,&tmp1, X[4], X[5], X[6], X[7], 1);

	tstate0x = tmp0.x;
	tstate0y = tmp0.y;
	tstate0z = tmp0.z;
	tstate0w = tmp0.w;

	tstate1x = tmp1.x;
	tstate1y = tmp1.y;
	tstate1z = tmp1.z;
	tstate1w = tmp1.w;

	//FOURfromvec(tstate0, tmp0);
	//FOURfromvec(tstate1, tmp1);

	// One more 64-round pass whose per-round addends are the precomputed
	// fixedW* tables; A..H are temporarily aliased onto the tstate scalars
	// so RND() can be reused.
#define A tstate0x
#define B tstate0y
#define C tstate0z
#define D tstate0w
#define E tstate1x
#define F tstate1y
#define G tstate1z
#define H tstate1w
	for(uint i=0; i<8; i++){
		RND(A,B,C,D,E,F,G,H, fixedWa[i]);
		RND(H,A,B,C,D,E,F,G, fixedWb[i]);
		RND(G,H,A,B,C,D,E,F, fixedWc[i]);
		RND(F,G,H,A,B,C,D,E, fixedWd[i]);
		RND(E,F,G,H,A,B,C,D, fixedWe[i]);
		RND(D,E,F,G,H,A,B,C, fixedWf[i]);
		RND(C,D,E,F,G,H,A,B, fixedWg[i]);
		RND(B,C,D,E,F,G,H,A, fixedWh[i]);
	}
#undef A
#undef B
#undef C
#undef D
#undef E
#undef F
#undef G
#undef H

	//FOURtovec(pass0, tstate0);
	//FOURtovec(pass1, tstate1);

	pass0.x = tstate0x;
	pass0.y = tstate0y;
	pass0.z = tstate0z;
	pass0.w = tstate0w;
	pass1.x = tstate1x;
	pass1.y = tstate1y;
	pass1.z = tstate1z;
	pass1.w = tstate1w;

	// Feed-forward for the fixed-schedule pass above.
	tmp0 += pass0;
	tmp1 += pass1;


	// Final hash: the 8-word result under the SK03-keyed state.
	// pass2 carries state words A..D and pass0 carries E..H for this call.
	pass2.x = ostate0x;
	pass2.y = ostate0y;
	pass2.z = ostate0z;
	pass2.w = ostate0w;

	pass0.x = ostate1x;
	pass0.y = ostate1y;
	pass0.z = ostate1z;
	pass0.w = ostate1w;


	//FOURtovec(pass2, ostate0);
	//FOURtovec(pass0, ostate1);
	SHA256(&pass2,&pass0, tmp0, tmp1, SK00, SK07, 1);

	// Only the last state word takes part in the target comparison.
	ostate1w = pass0.w;
	//FOURfromvec(ostate1, pass0);

	bool result = (EndianSwapa(ostate1w) <= target);
	if (result)
		SETFOUND(gid);
}








