//void scrypt_core(uint *X, __global uint *restrict lookup){
//void scrypt_core(uint4 *X, __global uint4 *restrict lookup){
//void scrypt_core(__local uint4 *lbuff, uint4 *X, __global uint4 *restrict lookup){

/*
 * scrypt_core - run the memory-hard mixing stage on one 32-word state.
 *
 * X       Working state of 32 uints (indices 0..31 are read and written).
 *         It is permuted/endian-swapped on entry, mixed in place, and
 *         permuted/endian-swapped back before returning.
 * lookup  Global scratchpad. Every access is indexed through the CO macro,
 *         which is defined elsewhere; it presumably combines the locals
 *         x, y, z with xSIZE/ySIZE/zSIZE (TODO confirm against its definition).
 *
 * Outline:
 *   1. Reorder the words of each 16-word half of X and endian-swap them.
 *   2. Fill: for y = 0 .. nfact/LOOKUP_GAP-1 store X at row y of the
 *      scratchpad, then advance X by LOOKUP_GAP salsa steps. Only every
 *      LOOKUP_GAP-th state is therefore stored.
 *   3. Mix: nfact times, pick j = X[28] & (nfact-1), fetch row j/LOOKUP_GAP,
 *      re-run the j%LOOKUP_GAP salsa steps that were not stored, XOR the
 *      result into X and apply one more salsa step.
 *   4. Apply the inverse word reordering and endian swap.
 *
 * NOTE: the statement-like macros used below (DEFNFACTOR, FAcpyfromvecoffset,
 * Vsalsacopytv, Vsalsacopyfv, Vsalsacopytvoff, Vsalsaxorfv) are invoked
 * without a trailing semicolon, exactly as elsewhere in this file.
 */
void scrypt_core(uint *X, __global uint *restrict lookup){
	DEFNFACTOR(nfact)
	/* Several of these locals are not referenced by name below; they appear
	 * to be consumed by the CO macro, so their names must not change. */
	const uint zSIZE = 32;
	const uint ySIZE = (nfact/LOOKUP_GAP+(nfact%LOOKUP_GAP>0));
	const uint xSIZE = CONCURRENT_THREADS;
	uint x = (uint)(get_global_id(0)%xSIZE);
	uint4 scopy[8];
	uint z=0;

	/* ---- 1. entry permutation, first half (words 0..15) ----
	 * All four vectors are gathered before anything is written back, so the
	 * gathers always see the original words. The first gather uses offsets
	 * 4/9/14/3, mirroring 20/25/30/19 in the second half. */
	uint4 tmp0 = (uint4)(X[4], X[9], X[14], X[3]);
	tmp0 = EndianSwapa(tmp0);
	uint4 tmp1 = (uint4)(X[8], X[13], X[2], X[7]);
	tmp1 = EndianSwapb(tmp1);
	uint4 tmp2 = (uint4)(X[12], X[1], X[6], X[11]);
	tmp2 = EndianSwapb(tmp2);
	uint4 tmp3 = (uint4)(X[0], X[5], X[10], X[15]);
	tmp3 = EndianSwapb(tmp3);

	FAcpyfromvecoffset(X, 0, tmp0)
	FAcpyfromvecoffset(X, 4, tmp1)
	FAcpyfromvecoffset(X, 8, tmp2)
	FAcpyfromvecoffset(X, 12, tmp3)

	/* ---- entry permutation, second half (words 16..31) ---- */
	tmp0 = (uint4)(X[20], X[25], X[30], X[19]);
	tmp0 = EndianSwapa(tmp0);
	tmp1 = (uint4)(X[24], X[29], X[18], X[23]);
	tmp1 = EndianSwapb(tmp1);
	tmp2 = (uint4)(X[28], X[17], X[22], X[27]);
	tmp2 = EndianSwapb(tmp2);
	tmp3 = (uint4)(X[16], X[21], X[26], X[31]);
	tmp3 = EndianSwapb(tmp3);

	FAcpyfromvecoffset(X, 16, tmp0)
	FAcpyfromvecoffset(X, 20, tmp1)
	FAcpyfromvecoffset(X, 24, tmp2)
	FAcpyfromvecoffset(X, 28, tmp3)

	/* ---- 2. fill the scratchpad ---- */
	for(uint y=0; y<(nfact/LOOKUP_GAP); ++y){
		for(z=0; z<zSIZE; ++z){
			/* atomic_xchg returns the PREVIOUS contents of the slot. That
			 * value is deliberately discarded: assigning it back to
			 * lookup[CO] would restore the old contents and undo the store. */
			atomic_xchg((lookup+CO), X[z]);
		}
		Vsalsacopytv(scopy, X)

#if (LOOKUP_GAP == 2)
		/* NOTE(review): the two-argument salsa is only used for
		 * LOOKUP_GAP == 2; presumably the flag selects a double step. */
		salsa(scopy, 1);
#else
		/* Advance by LOOKUP_GAP steps (exactly one when LOOKUP_GAP == 1). */
		for(uint i=0; i<LOOKUP_GAP; ++i)
			salsa(scopy);
#endif
		Vsalsacopyfv(X, scopy)
	}

#if (LOOKUP_GAP != 1) && (LOOKUP_GAP != 2) && (LOOKUP_GAP != 4) && (LOOKUP_GAP != 8)
	/* LOOKUP_GAP may not divide nfact: store one extra row and run the
	 * remaining nfact%LOOKUP_GAP steps. The braces keep this y from leaking
	 * into the mixing loop, which declares its own. */
	{
		uint y = (nfact/LOOKUP_GAP);
		for(z=0; z<zSIZE; ++z){
			atomic_xchg((lookup+CO), X[z]);
		}

		/* Reload scopy from X so it is defined even if the fill loop above
		 * executed zero times (nfact < LOOKUP_GAP). */
		Vsalsacopytv(scopy, X)
		for(uint i=0; i<nfact%LOOKUP_GAP; ++i)
			salsa(scopy);
		Vsalsacopyfv(X, scopy)
	}
#endif

	/* ---- 3. data-dependent mixing ---- */
	for (uint i=0; i<nfact; ++i){
		/* j selects the logical state; y is the stored row that precedes it.
		 * The mask assumes nfact is a power of two. */
		uint j = X[28] & (nfact-1);
		uint y = (j/LOOKUP_GAP);

#if (LOOKUP_GAP == 2)
		if(j&1){
			/* Odd j was not stored: fetch row y and advance it one step. */
			uint4 V[8];
			z=0;	/* CO must address the first word of the row */
			Vsalsacopytvoff(V, lookup, CO)
			salsa(V, 0);
			Vsalsaxorfv(X, V)
		}else{
			/* Even j is stored verbatim: XOR it in word by word. The inner
			 * z intentionally shadows the outer one; CO picks it up. */
			for(uint z=0; z<zSIZE; ++z)
				X[z] ^= lookup[CO];
		}
#else
		uint4 V[8];
		z=0;	/* CO must address the first word of the row */
		Vsalsacopytvoff(V, lookup, CO)
#if (LOOKUP_GAP != 1)
		/* Recompute the steps between the stored row and state j. */
		uint val = j%LOOKUP_GAP;
		for (z=0; z<val; ++z)
			salsa(V);
#endif
		Vsalsaxorfv(X, V)
#endif

		Vsalsacopytv(scopy, X)
#if (LOOKUP_GAP == 2)
		salsa(scopy, 0);
#else
		salsa(scopy);
#endif
		Vsalsacopyfv(X, scopy)
	}

	/* ---- 4. exit permutation, first half (inverse of step 1) ---- */
	tmp0 = (uint4)(X[12], X[9], X[6], X[3]);
	tmp0 = EndianSwapa(tmp0);
	tmp1 = (uint4)(X[0], X[13], X[10], X[7]);
	tmp1 = EndianSwapb(tmp1);
	tmp2 = (uint4)(X[4], X[1], X[14], X[11]);
	tmp2 = EndianSwapb(tmp2);
	tmp3 = (uint4)(X[8], X[5], X[2], X[15]);
	tmp3 = EndianSwapb(tmp3);

	FAcpyfromvecoffset(X, 0,  tmp0)
	FAcpyfromvecoffset(X, 4,  tmp1)
	FAcpyfromvecoffset(X, 8,  tmp2)
	FAcpyfromvecoffset(X, 12, tmp3)

	/* ---- exit permutation, second half ---- */
	tmp0 = (uint4)(X[28], X[25], X[22], X[19]);
	tmp0 = EndianSwapa(tmp0);
	tmp1 = (uint4)(X[16], X[29], X[26], X[23]);
	tmp1 = EndianSwapb(tmp1);
	tmp2 = (uint4)(X[20], X[17], X[30], X[27]);
	tmp2 = EndianSwapb(tmp2);
	tmp3 = (uint4)(X[24], X[21], X[18], X[31]);
	tmp3 = EndianSwapb(tmp3);

	FAcpyfromvecoffset(X, 16, tmp0)
	FAcpyfromvecoffset(X, 20, tmp1)
	FAcpyfromvecoffset(X, 24, tmp2)
	FAcpyfromvecoffset(X, 28, tmp3)

}

