#ifndef SCRYPT
#define SCRYPT 1

/*
 * scrypt_core - scratchpad fill followed by the data-dependent mixing pass.
 *
 * The working state is passed according to CLSIZE:
 *   CLSIZE == 64 : two uint16 pointers   (Xa, Xb)
 *   CLSIZE == 32 : four uint8 pointers   (Xa..Xd)
 *   CLSIZE == 16 : eight uint4 pointers  (Xa..Xh)
 *   otherwise    : a uint array X, indexed 0..31 here
 *
 * lookup : global scratchpad, addressed through the CO_* macros (defined
 *          outside this file section; they are expected to use the locals
 *          x, ystep and, on the scalar path, z).
 * gid    : work-item id; only gid % CONCURRENT_THREADS is used.
 *
 * Phase 1 stores the state into lookup once per row y and then calls salsa
 * with LOOKUP_GAP as its last argument, for nfact/LOOKUP_GAP rows (plus one
 * partial row when LOOKUP_GAP is not 1, 2, 4 or 8).
 * Phase 2 runs nfact iterations: an index j is taken from one state word,
 * row j/LOOKUP_GAP is fetched, passed through salsa when LOOKUP_GAP > 1,
 * XORed into the state, and the state is passed through salsa again.
 *
 * NOTE(review): the meaning of salsa's last argument is not visible here.
 * The literal 0 passed below is kept exactly as it was; confirm against the
 * salsa definition that it means "one application".
 */
#if (CLSIZE == 64)
void scrypt_core(uint16 *Xa, uint16 *Xb,  __global CLO *restrict lookup, uint gid){
#elif (CLSIZE == 32)
void scrypt_core(uint8 *Xa, uint8 *Xb,  uint8 *Xc, uint8 *Xd, __global CLO *restrict lookup, uint gid){
#elif (CLSIZE == 16)
void scrypt_core(uint4 *Xa, uint4 *Xb,  uint4 *Xc, uint4 *Xd, uint4 *Xe, uint4 *Xf,  uint4 *Xg, uint4 *Xh, __global CLO *restrict lookup, uint gid){
#else
void scrypt_core(uint *X, __global CLO *restrict lookup, uint gid){
#endif

	DEFNFACTOR(nfact)
	const uint xSIZE = CONCURRENT_THREADS;

	/* zSIZE: lookup elements per state; zstep: elements per row (xSIZE*zSIZE). */
#if (CLSIZE == 64)
	const uint zSIZE = 2;
	uint zstep = xSIZE<<1;
#elif (CLSIZE == 32)
	const uint zSIZE = 4;
	uint zstep = xSIZE<<2;
#elif (CLSIZE == 16)
	const uint zSIZE = 8;
	uint zstep = xSIZE<<3;
#else
	const uint zSIZE = 32;
	uint zstep = xSIZE<<5;
#endif
	uint ystep;
	uint x = gid % xSIZE;
	/*
	 * NOTE(review): a statement `x <= 3;` used to follow here. It was a
	 * comparison whose result was discarded, so it had no effect and has been
	 * dropped. If the CO_* macros expect x to be pre-scaled by zSIZE, a shift
	 * was probably intended - confirm against the CO_* definitions.
	 */

	/*
	 * Intended scratchpad layout (element index):
	 *   x*zSIZE + y*xSIZE*zSIZE + z
	 * ystep caches the y*xSIZE*zSIZE term for the current row.
	 */

	/* Phase 1: fill the scratchpad, one row per LOOKUP_GAP salsa steps. */
	for(uint y=0; y<(nfact/LOOKUP_GAP); ++y){
		ystep = y*zstep;
#if (CLSIZE == 64)
		lookup[CO_01] = *Xa;
		lookup[CO_02] = *Xb;
		salsa(Xa, Xb, LOOKUP_GAP);
#elif (CLSIZE == 32)
		lookup[CO_01] = *Xa;
		lookup[CO_02] = *Xb;
		lookup[CO_03] = *Xc;
		lookup[CO_04] = *Xd;
		salsa(Xa, Xb, Xc, Xd, LOOKUP_GAP);
#elif (CLSIZE == 16)
		lookup[CO_01] = *Xa;
		lookup[CO_02] = *Xb;
		lookup[CO_03] = *Xc;
		lookup[CO_04] = *Xd;
		lookup[CO_05] = *Xe;
		lookup[CO_06] = *Xf;
		lookup[CO_07] = *Xg;
		lookup[CO_08] = *Xh;
		salsa(Xa, Xb, Xc, Xd, Xe, Xf, Xg, Xh, LOOKUP_GAP);
#else
		/* Store all words first, then advance the state once per row. */
		for(uint z=0; z<zSIZE; ++z)
			lookup[CO_00] = X[z];
		salsa(X, LOOKUP_GAP);
#endif
	}

#if (LOOKUP_GAP != 1) && (LOOKUP_GAP != 2) && (LOOKUP_GAP != 4) && (LOOKUP_GAP != 8)
	/*
	 * LOOKUP_GAP is not one of the power-of-two cases handled above, so one
	 * extra, partial row is written. The loop variable of the fill loop is
	 * out of scope here, so the row index is recomputed explicitly.
	 */
	{
		const uint y = nfact/LOOKUP_GAP;
		ystep = y*zstep;
#if (CLSIZE == 64)
		lookup[CO_01] = *Xa;
		lookup[CO_02] = *Xb;
		salsa(Xa, Xb, nfact%LOOKUP_GAP);
#elif (CLSIZE == 32)
		lookup[CO_01] = *Xa;
		lookup[CO_02] = *Xb;
		lookup[CO_03] = *Xc;
		lookup[CO_04] = *Xd;
		salsa(Xa, Xb, Xc, Xd, nfact%LOOKUP_GAP);
#elif (CLSIZE == 16)
		lookup[CO_01] = *Xa;
		lookup[CO_02] = *Xb;
		lookup[CO_03] = *Xc;
		lookup[CO_04] = *Xd;
		lookup[CO_05] = *Xe;
		lookup[CO_06] = *Xf;
		lookup[CO_07] = *Xg;
		lookup[CO_08] = *Xh;
		salsa(Xa, Xb, Xc, Xd, Xe, Xf, Xg, Xh, nfact%LOOKUP_GAP);
#else
		for(uint z=0; z<zSIZE; ++z)
			lookup[CO_00] = X[z];
		salsa(X, nfact%LOOKUP_GAP);
#endif
	}
#endif

	/* Phase 2: nfact data-dependent reads from the scratchpad. */
	for(uint i=0; i<nfact; ++i){
		/*
		 * Index word, masked with nfact-1. The state arguments are pointers,
		 * so they are dereferenced before the vector component is selected.
		 */
#if (CLSIZE == 64)
		uint j = (*Xb).sb & (nfact-1);
#elif (CLSIZE == 32)
		uint j = (*Xc).s3 & (nfact-1);
#elif (CLSIZE == 16)
		uint j = (*Xh).s0 & (nfact-1);
#else
		uint j = X[28] & (nfact-1);
#endif

		/* Row holding the nearest stored state at or before index j. */
#if (LOOKUP_GAP == 1)
		uint y = j;
#elif (LOOKUP_GAP == 2)
		uint y = (j>>1);
#elif (LOOKUP_GAP == 4)
		uint y = (j>>2);
#elif (LOOKUP_GAP == 8)
		uint y = (j>>3);
#else
		uint y = (j/LOOKUP_GAP);
#endif
		ystep = y*zstep;

#if (LOOKUP_GAP == 1)
		/* Every state is stored: XOR the row straight into the state. */
#if (CLSIZE == 64)
		*Xa ^= lookup[CO_01];
		*Xb ^= lookup[CO_02];
#elif (CLSIZE == 32)
		*Xa ^= lookup[CO_01];
		*Xb ^= lookup[CO_02];
		*Xc ^= lookup[CO_03];
		*Xd ^= lookup[CO_04];
#elif (CLSIZE == 16)
		*Xa ^= lookup[CO_01];
		*Xb ^= lookup[CO_02];
		*Xc ^= lookup[CO_03];
		*Xd ^= lookup[CO_04];
		*Xe ^= lookup[CO_05];
		*Xf ^= lookup[CO_06];
		*Xg ^= lookup[CO_07];
		*Xh ^= lookup[CO_08];
#else
		for(uint z=0; z<zSIZE; ++z)
			X[z] ^= lookup[CO_00];
#endif

#else /* LOOKUP_GAP != 1 */

		/* Fetch the stored row into private copies. */
#if (CLSIZE == 64)
		CLO Va = lookup[CO_01];
		CLO Vb = lookup[CO_02];
#elif (CLSIZE == 32)
		CLO Va = lookup[CO_01];
		CLO Vb = lookup[CO_02];
		CLO Vc = lookup[CO_03];
		CLO Vd = lookup[CO_04];
#elif (CLSIZE == 16)
		CLO Va = lookup[CO_01];
		CLO Vb = lookup[CO_02];
		CLO Vc = lookup[CO_03];
		CLO Vd = lookup[CO_04];
		CLO Ve = lookup[CO_05];
		CLO Vf = lookup[CO_06];
		CLO Vg = lookup[CO_07];
		CLO Vh = lookup[CO_08];
#else
		uint V[32];
		for(uint z=0; z<zSIZE; ++z)
			V[z] = lookup[CO_00];
#endif

		/*
		 * Advance the fetched copy towards index j. The copies are passed by
		 * address, matching how the state pointers are handed to salsa
		 * elsewhere in this function.
		 */
#if (LOOKUP_GAP == 2)
		/* Only odd j lies between two stored rows. */
		if(j&1){
#if (CLSIZE == 64)
			salsa(&Va, &Vb, 0);
#elif (CLSIZE == 32)
			salsa(&Va, &Vb, &Vc, &Vd, 0);
#elif (CLSIZE == 16)
			salsa(&Va, &Vb, &Vc, &Vd, &Ve, &Vf, &Vg, &Vh, 0);
#else
			salsa(V, 0);
#endif
		}
#else
#if (CLSIZE == 64)
		salsa(&Va, &Vb, j%LOOKUP_GAP);
#elif (CLSIZE == 32)
		salsa(&Va, &Vb, &Vc, &Vd, j%LOOKUP_GAP);
#elif (CLSIZE == 16)
		salsa(&Va, &Vb, &Vc, &Vd, &Ve, &Vf, &Vg, &Vh, j%LOOKUP_GAP);
#else
		salsa(V, j%LOOKUP_GAP);
#endif
#endif

		/* Fold the fetched (and possibly advanced) copy into the state. */
#if (CLSIZE == 64)
		*Xa ^= Va;
		*Xb ^= Vb;
#elif (CLSIZE == 32)
		*Xa ^= Va;
		*Xb ^= Vb;
		*Xc ^= Vc;
		*Xd ^= Vd;
#elif (CLSIZE == 16)
		*Xa ^= Va;
		*Xb ^= Vb;
		*Xc ^= Vc;
		*Xd ^= Vd;
		*Xe ^= Ve;
		*Xf ^= Vf;
		*Xg ^= Vg;
		*Xh ^= Vh;
#else
		for(uint z=0; z<zSIZE; ++z)
			X[z] ^= V[z];
#endif

#endif /* LOOKUP_GAP == 1 */

		/* Mix the updated state before the next iteration. */
#if (CLSIZE == 64)
		salsa(Xa, Xb, 0);
#elif (CLSIZE == 32)
		salsa(Xa, Xb, Xc, Xd, 0);
#elif (CLSIZE == 16)
		salsa(Xa, Xb, Xc, Xd, Xe, Xf, Xg, Xh, 0);
#else
		salsa(X, 0);
#endif

	}
}

#endif