#ifndef SCRYPT
#define SCRYPT 1


/*
 * scrypt_core - memory-hard mixing stage operating on the caller's state X,
 * using `lookup` as the scratchpad.
 *
 * X       Per-work-item state, updated in place. Accessed as X[0..1] (uint16)
 *         when CLSIZE == 64 and as X[0..3] (uint8) otherwise.
 * lookup  Scratchpad shared between work-items; each work-item addresses its
 *         own slots through the CO* index macros.
 *
 * Phase 1 (write): X is stored to `lookup` once per LOOKUP_GAP mixing steps,
 * so only about nfact/LOOKUP_GAP snapshots are kept.
 * Phase 2 (read): nfact iterations; each derives an index j from the current
 * state, fetches the nearest stored snapshot at or below j, re-applies the
 * skipped mixing steps to a private copy when j falls between snapshots,
 * XORs the result into X and mixes X once more.
 *
 * NOTE(review): the CO, CO_0, CO_1, CO_W0 and CO_W1 macros are defined
 * elsewhere and presumably expand to expressions over the locals x, xSIZE,
 * y and z, which is why those locals keep these exact names - confirm
 * against the macro definitions before renaming anything.
 * NOTE(review): salsa(X, n) is assumed to apply n+1 mixing steps (and
 * salsa(X) exactly one); zero/one/three are assumed to be externally defined
 * constants 0/1/3 - confirm.
 */
#if (CLSIZE == 64)
void scrypt_core(uint16 *X, __global uint16 *restrict lookup){
#else
void scrypt_core(uint8 *X, __global uint8 *restrict lookup){
#endif

	DEFNFACTOR(nfact)

	// Work-items are folded onto CONCURRENT_THREADS scratchpad columns.
	const uint xSIZE = CONCURRENT_THREADS;
	uint x = (get_global_id(0)%CONCURRENT_THREADS);

#if (CLSIZE == 64)
	// Upper bound for the write loop, which advances y in steps of xSIZE.
	const uint ub = xSIZE*(nfact/LOOKUP_GAP);
	// NOTE(review): idx is not referenced directly in this function; it is
	// kept in case the CO_* macros expand to it - confirm, then remove.
	uint idx;
	// Byte-swap and permute the state on entry; the swizzle applied on exit
	// is the inverse permutation of this one.
	X[0] = EndianSwapa(X[0].s49e38d27c16b05af);
	X[1] = EndianSwapa(X[1].s49e38d27c16b05af);
#endif

// start write portion
#if (CLSIZE == 64)
	for(uint y=0; y<ub; y+=xSIZE){
		lookup[CO_W0] = X[0];
		lookup[CO_W1] = X[1];
#else
	for(uint y=0; y<(nfact/LOOKUP_GAP); ++y){
		for(uint z=0; z<4; z++)
			lookup[CO] = X[z];
#endif

		// Advance X by LOOKUP_GAP mixing steps before the next snapshot.
#if (LOOKUP_GAP == 2)
		salsa(X, one);
#elif (LOOKUP_GAP == 1)
		salsa(X);
#else
		salsa(X, LOOKUP_GAP-one);
#endif
	}

#if (LOOKUP_GAP != 1) && (LOOKUP_GAP != 2) && (LOOKUP_GAP != 4) && (LOOKUP_GAP != 8)
	// Tail for gaps other than 1/2/4/8: store one more snapshot at slot
	// nfact/LOOKUP_GAP and apply the remaining mixing steps.
	uint y = (nfact/LOOKUP_GAP);
# if (CLSIZE == 64)
	lookup[CO_0] = X[0];
	lookup[CO_1] = X[1];
# else
	// A stale `idx = ((y*xSIZE+x)<<2);` used to sit here. idx is not
	// declared in this configuration, so it broke compilation; the same
	// assignment was already disabled in both loops, so it is removed.
	for(uint z=0; z<4; z++)
		lookup[CO] = X[z];
# endif

	// NOTE(review): the fixed arguments for gaps 3/5/7 look like
	// nfact%LOOKUP_GAP - 1 for nfact == 1024 (1, 4 and 2 remaining steps);
	// they would be wrong for any other nfact - confirm.
# if (LOOKUP_GAP == 3)
	salsa(X, zero);
# elif (LOOKUP_GAP == 5)
	salsa(X, three);
# elif (LOOKUP_GAP == 7)
	salsa(X, one);
# else
	for(uint i=0; i<nfact%LOOKUP_GAP; ++i)
		salsa(X, zero);
# endif
#endif
// end write portion


// read portion
	for(uint i=0; i<nfact; i++){

		// j: state-dependent index, masked with nfact-1.
#if (CLSIZE == 64)
		uint j = X[1].sc & (nfact-1);
#else
		uint j = X[3].s4 & (nfact-1);
#endif

		// y: slot of the stored snapshot at or below j.
#if (LOOKUP_GAP == 1)
		uint y = j;
#elif (LOOKUP_GAP == 2)
		uint y = (j>>1);
#elif (LOOKUP_GAP == 4)
		uint y = (j>>2);
#elif (LOOKUP_GAP == 8)
		uint y = (j>>3);
#else
		uint y = (j/LOOKUP_GAP);
#endif

#if (LOOKUP_GAP == 2)
		if(j&1){
			// Odd j: the wanted entry lies one step past the snapshot,
			// so rebuild it on a private copy before XORing it in.
# if (CLSIZE == 64)
			uint16 V[2] = {lookup[CO_0], lookup[CO_1]};
			salsa(V, zero);
			X[0] ^= V[0];
			X[1] ^= V[1];
# else
			uint8 V[4];
			for(uint z=0; z<4; z++)
				V[z] = lookup[CO];
			salsa(V, zero);
			for(uint z=0; z<4; z++)
				X[z] ^= V[z];
# endif
		}else{
			// Even j: the snapshot is the wanted entry.
# if (CLSIZE == 64)
			X[0] ^= lookup[CO_0];
			X[1] ^= lookup[CO_1];
# else
			for(uint z=0; z<4; z++)
				X[z] ^= lookup[CO];
# endif
		}
		salsa(X, zero);

#elif (LOOKUP_GAP != 1)
		// Reduce j to its distance from the snapshot (j % LOOKUP_GAP).
		j -= y*LOOKUP_GAP;
		if(j){
			// Rebuild the wanted entry by replaying the skipped steps on
			// a private copy of the snapshot.
# if (CLSIZE == 64)
			uint16 V[2] = {lookup[CO_0], lookup[CO_1]};
			salsa(V, j-one);
			X[0] ^= V[0];
			X[1] ^= V[1];
# else
			uint8 V[4];
			for(uint z=0; z<4; z++)
				V[z] = lookup[CO];
			salsa(V, j-one);
			for(uint z=0; z<4; z++)
				X[z] ^= V[z];
# endif
		}else{
			// j sits exactly on a snapshot; use it directly.
# if (CLSIZE == 64)
			X[0] ^= lookup[CO_0];
			X[1] ^= lookup[CO_1];
# else
			for(uint z=0; z<4; z++)
				X[z] ^= lookup[CO];
# endif
		}
		salsa(X, zero);

#else
		// LOOKUP_GAP == 1: every entry is stored, no recomputation needed.
# if (CLSIZE == 64)
		X[0] ^= lookup[CO_0];
		X[1] ^= lookup[CO_1];
# else
		for(uint z=0; z<4; z++)
			X[z] ^= lookup[CO];
# endif

		salsa(X);
#endif

	}
// end read portion

#if (CLSIZE == 64)
	// Undo the entry permutation and byte swap.
	X[0] = EndianSwapa(X[0].sc9630da741eb852f);
	X[1] = EndianSwapa(X[1].sc9630da741eb852f);
#endif

}

#endif