#ifndef SCRYPT
#define SCRYPT 1

/*
 * Mixes the four vectors w[0..3] in place.
 *
 * Each of the four passes performs eight add-rotate-xor steps with the
 * rotation amounts 7, 9, 13, 18 applied twice. The first four steps add
 * the vectors lane by lane; the last four add lane-permuted copies
 * (.wxyz and .zwxy swizzles). Every step reads values written by the
 * step before it, so the statement order must not be changed.
 *
 * w - array of at least four uint4 values, read and overwritten.
 */
void halfsalsa(uint4 *w){
	uint4 sum;

	for(uint pass=0; pass<4; ++pass){
		/* Steps 1-4: operands are added lane-for-lane. */
		sum = w[3] + w[2];
		w[0] ^= rotl(sum, 7U);
		sum = w[0] + w[3];
		w[1] ^= rotl(sum, 9U);
		sum = w[1] + w[0];
		w[2] ^= rotl(sum, 13U);
		sum = w[2] + w[1];
		w[3] ^= rotl(sum, 18U);

		/* Steps 5-8: operands are lane-permuted before the add. */
		sum = w[3].wxyz + w[0].zwxy;
		w[2] ^= rotl(sum, 7U);
		sum = w[2].wxyz + w[3].zwxy;
		w[1] ^= rotl(sum, 9U);
		sum = w[1].wxyz + w[2].zwxy;
		w[0] ^= rotl(sum, 13U);
		sum = w[0].wxyz + w[1].zwxy;
		w[3] ^= rotl(sum, 18U);
	}
}

/*
 * Mixes the eight-vector state B in place, treating B[0..3] and B[4..7]
 * as two halves that are alternately folded into each other around calls
 * to halfsalsa().
 *
 * One pass is: xor the upper half into the lower half, mix, add the mix
 * result to the lower half and xor that into the upper half, mix again,
 * then add the second mix result to the upper half.
 *
 * When LOOKUP_GAP == 2 the function takes an extra flag `db`. With db set,
 * a second pass is run inside the same call: the closing add of the first
 * pass is merged with the opening xor of the second, so the result equals
 * two consecutive single passes.
 *
 * B  - array of at least eight uint4 values, read and overwritten.
 * db - (LOOKUP_GAP == 2 only) nonzero to run two passes instead of one.
 */
#if (LOOKUP_GAP == 2)
void salsa(uint4 *B, bool db){
#else
void salsa(uint4 *B){
#endif
	uint4 w[4];

	/* Lower half absorbs the upper half; w receives the new lower half. */
	for(uint k=0; k<4; ++k){
		B[k] ^= B[k+4];
		w[k] = B[k];
	}
	halfsalsa(w);

	/* Add the mix to the lower half, xor it into the upper half. */
	for(uint k=0; k<4; ++k){
		B[k] += w[k];
		B[k+4] ^= B[k];
		w[k] = B[k+4];
	}
	halfsalsa(w);

#if (LOOKUP_GAP == 2)
	if(db){
		/* Finish pass one (upper += mix) and start pass two (lower ^= upper). */
		for(uint k=0; k<4; ++k){
			B[k+4] += w[k];
			B[k] ^= B[k+4];
			w[k] = B[k];
		}
		halfsalsa(w);

		for(uint k=0; k<4; ++k){
			B[k] += w[k];
			B[k+4] ^= B[k];
			w[k] = B[k+4];
		}
		halfsalsa(w);
	}
#endif

	/* Closing add of the last pass. */
	for(uint k=0; k<4; ++k)
		B[k+4] += w[k];
}

/*
 * Runs the two-phase scratchpad mixing over the eight-vector state X.
 *
 * Phase 1 (fill): snapshots of X are written to `lookup`, with salsa()
 * applied LOOKUP_GAP times between consecutive snapshots, for a total of
 * nfact salsa() applications.
 * Phase 2 (mix): nfact times, an index j is taken from X[7].x, the state
 * belonging to j is fetched (snapshot j/LOOKUP_GAP advanced by
 * j%LOOKUP_GAP salsa() applications), xored into X, and X is mixed once.
 *
 * Before phase 1 and after phase 2 the lanes of X[0..3] and X[4..7] are
 * permuted and passed through the EndianSwapa/EndianSwapb macros.
 *
 * X      - eight uint4 values, transformed in place.
 * lookup - global scratch buffer. Every access goes through the CO macro,
 *          which is defined outside this section; it presumably combines
 *          the local z, x, y with zSIZE/xSIZE - confirm against its
 *          definition.
 *
 * nfact is introduced by DEFNFACTOR (defined elsewhere). The `& (nfact-1)`
 * mask below only yields a uniform index if nfact is a power of two.
 */
void scrypt_core(uint4 *X, __global uint4 *restrict lookup){
	DEFNFACTOR(nfact)
	const uint zSIZE = 8;
	/* Snapshots per work-item that phases 1 and 2 actually need: */
	//const uint ySIZE = (nfact/LOOKUP_GAP+(nfact%LOOKUP_GAP>0));
	const uint xSIZE = CONCURRENT_THREADS;
	/* x and xSIZE are not used directly below; presumably consumed by CO. */
	uint x = get_global_id(0)%xSIZE;
	uint4 tmp[4];

	/* Input lane permutation, lower half. */
	tmp[0] = (uint4)(X[1].x,X[2].y,X[3].z,X[0].w);
	tmp[1] = (uint4)(X[2].x,X[3].y,X[0].z,X[1].w);
	tmp[2] = (uint4)(X[3].x,X[0].y,X[1].z,X[2].w);
	tmp[3] = (uint4)(X[0].x,X[1].y,X[2].z,X[3].w);

	X[0] = EndianSwapa(tmp[0]);
	X[1] = EndianSwapb(tmp[1]);
	X[2] = EndianSwapb(tmp[2]);
	X[3] = EndianSwapb(tmp[3]);

	/* Input lane permutation, upper half. */
	tmp[0] = (uint4)(X[5].x,X[6].y,X[7].z,X[4].w);
	tmp[1] = (uint4)(X[6].x,X[7].y,X[4].z,X[5].w);
	tmp[2] = (uint4)(X[7].x,X[4].y,X[5].z,X[6].w);
	tmp[3] = (uint4)(X[4].x,X[5].y,X[6].z,X[7].w);

	X[4] = EndianSwapa(tmp[0]);
	X[5] = EndianSwapb(tmp[1]);
	X[6] = EndianSwapb(tmp[2]);
	X[7] = EndianSwapb(tmp[3]);

	/* Phase 1: store a snapshot, then advance X by LOOKUP_GAP steps. */
	for(uint y=0; y<(nfact/LOOKUP_GAP); ++y){
		for(uint z=0; z<zSIZE; ++z)
			lookup[CO] = X[z];

#if (LOOKUP_GAP == 2)
		salsa(X, 1);	/* db set: two steps in one call */
#elif (LOOKUP_GAP == 1)
		salsa(X);
#else
		for(uint i=0; i<LOOKUP_GAP; ++i)
			salsa(X);
#endif
	}
#if (LOOKUP_GAP != 1) && (LOOKUP_GAP != 2) && (LOOKUP_GAP != 4) && (LOOKUP_GAP != 8)
	/*
	 * Partial final group. Only needed when LOOKUP_GAP does not divide
	 * nfact: otherwise phase 2 never reads slot nfact/LOOKUP_GAP, and
	 * writing it would touch one slot more than the count noted above.
	 */
	if(nfact%LOOKUP_GAP > 0){
		uint y = (nfact/LOOKUP_GAP);
		for(uint z=0; z<zSIZE; ++z)
			lookup[CO] = X[z];
		for(uint i=0; i<nfact%LOOKUP_GAP; ++i)
			salsa(X);
	}
#endif

	/* Phase 2: data-dependent reads from the scratchpad. */
	for (uint i=0; i<nfact; ++i){
		uint j = X[7].x & (nfact-1);
		/* Snapshot slot that covers state j; y is read by CO. */
		uint y = (j/LOOKUP_GAP);

#if (LOOKUP_GAP == 1)
		/* Every state was stored: xor it in directly. */
		for(uint z=0; z<zSIZE; ++z)
			X[z] ^= lookup[CO];
#elif (LOOKUP_GAP == 2)
		if(j&1){
			/* Odd j: stored snapshot is one step behind, advance a copy. */
			uint4 V[8];
			for(uint z=0; z<zSIZE; ++z)
				V[z] = lookup[CO];
			salsa(V, 0);
			for(uint z=0; z<zSIZE; ++z)
				X[z] ^= V[z];
		}else{
			for(uint z=0; z<zSIZE; ++z)
				X[z] ^= lookup[CO];
		}
#else
		/* Load the snapshot and advance a copy by the missing steps. */
		uint4 V[8];
		for(uint z=0; z<zSIZE; ++z)
			V[z] = lookup[CO];
		uint val = j%LOOKUP_GAP;
		for (uint z=0; z<val; ++z)
			salsa(V);
		for(uint z=0; z<zSIZE; ++z)
			X[z] ^= V[z];
#endif

#if (LOOKUP_GAP == 2)
		salsa(X, 0);
#else
		salsa(X);
#endif
	}

	/* Output lane permutation, lower half. */
	tmp[0] = (uint4)(X[3].x,X[2].y,X[1].z,X[0].w);
	tmp[1] = (uint4)(X[0].x,X[3].y,X[2].z,X[1].w);
	tmp[2] = (uint4)(X[1].x,X[0].y,X[3].z,X[2].w);
	tmp[3] = (uint4)(X[2].x,X[1].y,X[0].z,X[3].w);

	X[0] = EndianSwapa(tmp[0]);
	X[1] = EndianSwapb(tmp[1]);
	X[2] = EndianSwapb(tmp[2]);
	X[3] = EndianSwapb(tmp[3]);

	/* Output lane permutation, upper half. */
	tmp[0] = (uint4)(X[7].x,X[6].y,X[5].z,X[4].w);
	tmp[1] = (uint4)(X[4].x,X[7].y,X[6].z,X[5].w);
	tmp[2] = (uint4)(X[5].x,X[4].y,X[7].z,X[6].w);
	tmp[3] = (uint4)(X[6].x,X[5].y,X[4].z,X[7].w);

	X[4] = EndianSwapa(tmp[0]);
	X[5] = EndianSwapb(tmp[1]);
	X[6] = EndianSwapb(tmp[2]);
	X[7] = EndianSwapb(tmp[3]);
}

#endif