#ifndef SCRYPT
#define SCRYPT 1

/*
 * Mixes the four uint4 words w[0..3] in place.
 *
 * Runs four passes. Each pass performs eight add-rotate-xor updates:
 * four that combine the words lane-for-lane, then four that combine
 * lane-permuted (.wxyz / .zwxy) views of the words. The rotation amounts
 * cycle through 7, 9, 13 and 18 in both halves.
 *
 * rotl is supplied from outside this function.
 */
void halfsalsa(uint4 *w){
	for(uint pass=4; pass!=0; --pass){
		/* First half: operands are taken without lane permutation. */
		w[0] = w[0] ^ rotl(w[3] + w[2], 7U);
		w[1] = w[1] ^ rotl(w[0] + w[3], 9U);
		w[2] = w[2] ^ rotl(w[1] + w[0], 13U);
		w[3] = w[3] ^ rotl(w[2] + w[1], 18U);

		/* Second half: operands are swizzled before being added, and the
		 * words are updated in the order 2, 1, 0, 3. */
		w[2] = w[2] ^ rotl(w[3].wxyz + w[0].zwxy, 7U);
		w[1] = w[1] ^ rotl(w[2].wxyz + w[3].zwxy, 9U);
		w[0] = w[0] ^ rotl(w[1].wxyz + w[2].zwxy, 13U);
		w[3] = w[3] ^ rotl(w[0].wxyz + w[1].zwxy, 18U);
	}
}

/*
 * Mixes the eight uint4 words B[0..7] in place, treating them as a lower
 * half B[0..3] and an upper half B[4..7].
 *
 * One application is: lower ^= upper, halfsalsa on a copy of lower,
 * lower += copy, upper ^= lower, halfsalsa on a copy of upper,
 * upper += copy.
 *
 * When LOOKUP_GAP == 2 the function takes an extra flag db; a true db
 * chains a second application directly onto the first (the first
 * application's final "upper += copy" is folded into the start of the
 * second), so the result equals two consecutive applications.
 */
#if (LOOKUP_GAP == 2)
void salsa(uint4 *B, bool db){
#else
void salsa(uint4 *B){
#endif
	uint4 mix[4];

	/* Fold the upper half into the lower half and mix a copy of it. */
	for(uint k=0; k<4; ++k){
		B[k] ^= B[k+4];
		mix[k] = B[k];
	}
	halfsalsa(mix);

	/* Feed the mixed words back into the lower half, fold the lower half
	 * into the upper half, and mix a copy of the upper half. */
	for(uint k=0; k<4; ++k){
		B[k] += mix[k];
		B[k+4] ^= B[k];
		mix[k] = B[k+4];
	}
	halfsalsa(mix);

#if (LOOKUP_GAP == 2)
	if(db){
		/* Finish the first application (upper += mix) and immediately
		 * start the second one (lower ^= upper). */
		for(uint k=0; k<4; ++k){
			B[k+4] += mix[k];
			B[k] ^= B[k+4];
			mix[k] = B[k];
		}
		halfsalsa(mix);

		for(uint k=0; k<4; ++k){
			B[k] += mix[k];
			B[k+4] ^= B[k];
			mix[k] = B[k+4];
		}
		halfsalsa(mix);
	}
#endif

	/* Final feed-forward into the upper half. */
	for(uint k=0; k<4; ++k)
		B[k+4] += mix[k];
}

/*
 * Runs the lookup-table mixing stage over a working state of eight uint4
 * words (X[0..7]).
 *
 * Steps, in order:
 *   1. Gather X from elements 0..3 of XA..XH, passing each gathered vector
 *      through EndianSwapa / EndianSwapb.
 *   2. Fill phase: store snapshots of X into lookup, advancing X with
 *      salsa() between snapshots so that one snapshot is kept per
 *      LOOKUP_GAP salsa applications.
 *   3. Mix phase: nfact iterations; each one picks a table position from
 *      X[7].x, obtains the corresponding state (re-running salsa() on the
 *      nearest stored snapshot when LOOKUP_GAP > 1), xors it into X and
 *      applies salsa() to X.
 *   4. Scatter X back to elements 0..3 of XA..XH, again through
 *      EndianSwapa / EndianSwapb.
 *
 * Parameters:
 *   XA..XH  - eight uint arrays; elements 0..3 of each are read at entry
 *             and overwritten at exit.
 *   lookup  - __global scratch table, indexed only through the CO macro.
 *
 * Depends on names defined outside this function: LOOKUP_GAP,
 * CONCURRENT_THREADS, DEFNFACTOR, CO, EndianSwapa, EndianSwapb.
 * NOTE(review): CO presumably derives the table index from the locals
 * x, y and z (and possibly xSIZE / zSIZE), which is why those names must
 * be in scope wherever lookup[CO] appears - confirm against its definition.
 */
void scrypt_core(volatile uint *XA, volatile uint *XB, volatile uint *XC, volatile uint *XD, volatile uint *XE, volatile uint *XF,
		volatile  uint *XG, volatile uint *XH, __global uint4 *restrict lookup){

	/* DEFNFACTOR is defined elsewhere; presumably it declares nfact, the
	 * iteration count used below - TODO confirm. */
	DEFNFACTOR(nfact)
	const uint zSIZE = 8;
	const uint xSIZE = CONCURRENT_THREADS;
	/* Global work-item id folded into the range [0, xSIZE). */
	uint x = get_global_id(0)%xSIZE;
	uint4 X[8];


	/* Gather X[0..3] from elements 0 and 1 of the eight input arrays. */
	uint4 tmpa = (uint4)(XE[0], XB[1], XG[1], XD[0]);
	uint4 tmpb = (uint4)(XA[1], XF[1], XC[0], XH[0]);
	uint4 tmpc = (uint4)(XE[1], XB[0], XG[0], XD[1]);
	uint4 tmpd = (uint4)(XA[0], XF[0], XC[1], XH[1]);
	X[0] = EndianSwapa(tmpa);
	X[1] = EndianSwapb(tmpb);
	X[2] = EndianSwapb(tmpc);
	X[3] = EndianSwapb(tmpd);
	/* Gather X[4..7] the same way from elements 2 and 3. */
	tmpa = (uint4)(XE[2], XB[3], XG[3], XD[2]);
	tmpb = (uint4)(XA[3], XF[3], XC[2], XH[2]);
	tmpc = (uint4)(XE[3], XB[2], XG[2], XD[3]);
	tmpd = (uint4)(XA[2], XF[2], XC[3], XH[3]);
	X[4] = EndianSwapa(tmpa);
	X[5] = EndianSwapb(tmpb);
	X[6] = EndianSwapb(tmpc);
	X[7] = EndianSwapb(tmpd);

	/* Fill phase: for each y, store the current X (eight words, z = 0..7)
	 * and then advance X by LOOKUP_GAP salsa applications. */
	for(uint y=0; y<(nfact/LOOKUP_GAP); ++y){
		for(uint z=0; z<zSIZE; ++z)
			lookup[CO] = X[z];

#if (LOOKUP_GAP == 2)
		/* db = 1: a single call performs both applications. */
		salsa(X, 1);
#elif (LOOKUP_GAP == 1)
		salsa(X);
#else
		for(uint i=0; i<LOOKUP_GAP; ++i)
			salsa(X);
#endif
	}
#if (LOOKUP_GAP != 1) && (LOOKUP_GAP != 2) && (LOOKUP_GAP != 4) && (LOOKUP_GAP != 8)
        /* Only for gaps other than 1, 2, 4 and 8: store one more snapshot at
         * y = nfact/LOOKUP_GAP and apply the remaining nfact%LOOKUP_GAP
         * salsa steps, covering the case where the gap does not divide nfact.
         * This y stays in scope for the rest of the function; the mix loop
         * below declares its own y, which shadows it. */
        uint y = (nfact/LOOKUP_GAP);
        for(uint z=0; z<zSIZE; ++z)
            lookup[CO] = X[z];
        for(uint i=0; i<nfact%LOOKUP_GAP; ++i)
            salsa(X);
#endif

	/* Mix phase. Both preprocessor branches open the same loop; they differ
	 * only in the name given to the masked value of X[7].x: j when a gap is
	 * in use (y is derived from it below), or y directly when
	 * LOOKUP_GAP == 1.
	 * NOTE(review): masking with (nfact-1) acts as a modulo only if nfact
	 * is a power of two - confirm. */
#if (LOOKUP_GAP != 1)
    for (uint i=0; i<nfact; ++i){
        uint j = X[7].x & (nfact-1);
#else
	for (uint i=0; i<nfact; ++i){
		uint y = X[7].x & (nfact-1);
#endif

	/* y = j / LOOKUP_GAP, written as a shift for the power-of-two gaps. */
#if (LOOKUP_GAP == 1)

#elif (LOOKUP_GAP == 2)
		uint y = (j>>1);
#elif (LOOKUP_GAP == 4)
		uint y = (j>>2);
#elif (LOOKUP_GAP == 8)
		uint y = (j>>3);
#else
		uint y = (j/LOOKUP_GAP);
#endif

	/* For gaps other than 1 and 2: load the stored snapshot for y into V. */
#if (LOOKUP_GAP != 2) && (LOOKUP_GAP != 1)
		uint4 V[8];
		for(uint z=0; z<zSIZE; ++z)
			V[z] = lookup[CO];
#endif

#if (LOOKUP_GAP == 1)
		/* Every state is stored: xor the table entry straight into X. */
		for(uint z=0; z<zSIZE; ++z)
			X[z] ^= lookup[CO];
#elif (LOOKUP_GAP == 2)
		if(j&1){
			/* Odd j: load the stored snapshot, advance it by one salsa
			 * application (db = 0), then xor it into X. */
			uint4 V[8];
			for(uint z=0; z<zSIZE; ++z)
				V[z] = lookup[CO];
			salsa(V, 0);
			for(uint z=0; z<zSIZE; ++z)
				X[z] ^= V[z];
		}else{
			/* Even j: the stored snapshot is used as-is. */
			for(uint z=0; z<zSIZE; ++z)
				X[z] ^= lookup[CO];
		}
#else
		/* Advance the loaded snapshot by j % LOOKUP_GAP salsa applications. */
		uint val = j%LOOKUP_GAP;
		for (uint z=0; z<val; ++z)
			salsa(V);
#endif

#if (LOOKUP_GAP != 2) && (LOOKUP_GAP != 1)
		for(uint z=0; z<zSIZE; ++z)
			X[z] ^= V[z];
#endif

	/* One salsa application on X per mix iteration. */
#if (LOOKUP_GAP == 2)
        salsa(X, 0);
#else
        salsa(X);
#endif
    }

	/* Scatter X[0..3] to elements 0 and 1 of the output arrays: each tmp
	 * vector takes one lane from each of the four words. */
	tmpa = (uint4)(X[3].x,X[2].y,X[1].z,X[0].w);
	tmpb = (uint4)(X[0].x,X[3].y,X[2].z,X[1].w);
	tmpc = (uint4)(X[1].x,X[0].y,X[3].z,X[2].w);
	tmpd = (uint4)(X[2].x,X[1].y,X[0].z,X[3].w);
	tmpa = EndianSwapa(tmpa);
	XA[0] = tmpa.x;
	XB[0] = tmpa.y;
	XC[0] = tmpa.z;
	XD[0] = tmpa.w;
	tmpb = EndianSwapb(tmpb);
	XE[0] = tmpb.x;
	XF[0] = tmpb.y;
	XG[0] = tmpb.z;
	XH[0] = tmpb.w;
	tmpc = EndianSwapb(tmpc);
	XA[1] = tmpc.x;
	XB[1] = tmpc.y;
	XC[1] = tmpc.z;
	XD[1] = tmpc.w;
	tmpd = EndianSwapb(tmpd);
	XE[1] = tmpd.x;
	XF[1] = tmpd.y;
	XG[1] = tmpd.z;
	XH[1] = tmpd.w;
	/* Scatter X[4..7] the same way to elements 2 and 3. */
	tmpa = (uint4)(X[7].x,X[6].y,X[5].z,X[4].w);
	tmpb = (uint4)(X[4].x,X[7].y,X[6].z,X[5].w);
	tmpc = (uint4)(X[5].x,X[4].y,X[7].z,X[6].w);
	tmpd = (uint4)(X[6].x,X[5].y,X[4].z,X[7].w);
	tmpa = EndianSwapa(tmpa);
	XA[2] = tmpa.x;
	XB[2] = tmpa.y;
	XC[2] = tmpa.z;
	XD[2] = tmpa.w;
	tmpb = EndianSwapb(tmpb);
	XE[2] = tmpb.x;
	XF[2] = tmpb.y;
	XG[2] = tmpb.z;
	XH[2] = tmpb.w;
	tmpc = EndianSwapb(tmpc);
	XA[3] = tmpc.x;
	XB[3] = tmpc.y;
	XC[3] = tmpc.z;
	XD[3] = tmpc.w;
	tmpd = EndianSwapb(tmpd);
	XE[3] = tmpd.x;
	XF[3] = tmpd.y;
	XG[3] = tmpd.z;
	XH[3] = tmpd.w;
}

#endif