#ifndef SCRYPT
#define SCRYPT 1

/*
void halfsalsa(uint4 *w){
	for(uint i=0; i<4; ++i){
		w[0] ^= rotl(w[3]     +w[2]     , 7U);
		w[1] ^= rotl(w[0]     +w[3]     , 9U);
		w[2] ^= rotl(w[1]     +w[0]     ,13U);
		w[3] ^= rotl(w[2]     +w[1]     ,18U);
		w[2] ^= rotl(w[3].wxyz+w[0].zwxy, 7U);
		w[1] ^= rotl(w[2].wxyz+w[3].zwxy, 9U);
		w[0] ^= rotl(w[1].wxyz+w[2].zwxy,13U);
		w[3] ^= rotl(w[0].wxyz+w[1].zwxy,18U);
	}
}

#if (LOOKUP_GAP == 2)
void salsa(uint4 *B, bool db){
#else
void salsa(uint4 *B){
#endif
    uint4 w[4];

	for(uint i=0; i<4; ++i)
		w[i] = (B[i]^=B[i+4]);
	halfsalsa(w);
	for(uint i=0; i<4; ++i)
		w[i] = (B[i+4]^=(B[i]+=w[i]));
	halfsalsa(w);
#if (LOOKUP_GAP == 2)
	if(db){
		for(uint i=0; i<4; ++i)
			w[i] = (B[i]^=(B[i+4]+=w[i]));
		halfsalsa(w);
		for(uint i=0; i<4; ++i)
			w[i] = (B[i+4]^=(B[i]+=w[i]));
		halfsalsa(w);
	}
#endif
	for(uint i=0; i<4; ++i)
		B[i+4] += w[i];
}

*/


//void scrypt_core(volatile uint *XA, volatile uint *XB, volatile uint *XC, volatile uint *XD, volatile uint *XE, volatile uint *XF,
//		volatile  uint *XG, volatile uint *XH, __global uint4 *restrict lookup){

//void scrypt_core(uint16 *X, __global uint16 *restrict lookup){

/*
#if (LOOKUP_GAP == 3)

__constant bool lgt1[1024] = {
0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 
1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 
1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 
0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 
1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 
1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 
0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 
1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 
1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 
0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 
1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 
1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 
0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 
1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 
1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 
0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 
1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 
1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 
0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 
1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 
1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 
0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 
1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 
1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 
0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 
1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 
1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 
0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 
1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 
1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 
0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 
1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0
};
__constant bool lgt2[1024] = {
0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 
1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 
0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 
0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 
1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 
0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 
0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 
1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 
0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 
0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 
1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 
0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 
0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 
1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 
0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 
0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 
1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 
0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 
0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 
1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 
0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 
0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 
1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 
0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 
0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 
1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 
0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 
0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 
1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 
0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 
0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 
1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0
};
#endif
*/
//volatile __global uint16 *restrict
/*
 * scrypt_core - mix the two-element uint16 state X in place, using `lookup`
 * as a scratch table.
 *
 * X       two uint16 values (X[0], X[1]); read and overwritten.
 * lookup  scratch table in __global memory; written in phase 1 and read back
 *         in phase 2, always through the CO_0 / CO_1 index macros.
 *
 * Phase 1 saves X into `lookup`, calling salsa() between saves.
 * Phase 2 runs nfact iterations: derive an index j from X[1].sc, fetch the
 * saved entry for j / LOOKUP_GAP, call salsa() on it when j is not a multiple
 * of LOOKUP_GAP, XOR it into X, then call salsa() on X.
 *
 * NOTE(review): DEFNFACTOR, CONCURRENT_THREADS, LOOKUP_GAP, CO_0, CO_1 and
 * salsa() are defined outside this section. The call sites suggest
 * salsa(S, k) performs k+1 mixing steps (and salsa(S) exactly one when
 * LOOKUP_GAP == 1) - confirm against the salsa() definition.
 * NOTE(review): the overall shape looks like scrypt's ROMix with only every
 * LOOKUP_GAP-th entry stored - confirm.
 */
void scrypt_core(uint16 *X, __global uint16 *restrict lookup){

	// Presumably declares `nfact`, which is used below as the iteration count
	// and (as nfact-1) as an index mask - confirm against the macro definition.
	DEFNFACTOR(nfact)
	uint xSIZE = CONCURRENT_THREADS;
	uint x = get_global_id(0)%xSIZE;
	// Both values are doubled. Neither is referenced by name again in this
	// function, so they are presumably consumed by CO_0 / CO_1. A commented-out
	// legacy definition in this file reads:
	//     #define Coord(x,y,o) x+y*(x ## SIZE)+o
	//     #define CO_0 Coord(x,y,0)
	//     #define CO_1 Coord(x,y,1)
	// i.e. index = x + y*xSIZE + {0,1}; the doubling would then make room for
	// the two uint16 values per entry. TODO confirm against the live macros.
	x <<= 1U;
	xSIZE <<= 1U;

	// Phase 1: for each y in [0, nfact/LOOKUP_GAP) save the current X, then
	// call salsa() once before the next save.
	for(uint y=0; y<(nfact/LOOKUP_GAP); ++y){
		lookup[CO_0] = X[0];
		lookup[CO_1] = X[1];

#if (LOOKUP_GAP == 2)
		salsa(X, 1);
#elif (LOOKUP_GAP == 1)
		salsa(X);
#else
		// Replaces an earlier loop that called salsa(X) LOOKUP_GAP times.
		salsa(X, LOOKUP_GAP-1);
#endif
	}
#if (LOOKUP_GAP != 1) && (LOOKUP_GAP != 2) && (LOOKUP_GAP != 4) && (LOOKUP_GAP != 8)
	// Tail for gaps other than 1, 2, 4 and 8: save one more entry at
	// y = nfact/LOOKUP_GAP, then call salsa() for the remainder.
        uint y = (nfact/LOOKUP_GAP);
		lookup[CO_0] = X[0];
		lookup[CO_1] = X[1];


	// NOTE(review): the literal arguments below (0, 3, 1) equal
	// (1024 % LOOKUP_GAP) - 1 for gaps 3, 5 and 7, so these branches look
	// specialised to nfact == 1024. Confirm they are still right if
	// DEFNFACTOR can produce a different value. The generic branch instead
	// calls salsa(X, 0) nfact % LOOKUP_GAP times.
#if (LOOKUP_GAP == 3)
		salsa(X, 0);
#elif (LOOKUP_GAP == 5)
		salsa(X, 3);
#elif (LOOKUP_GAP == 7)
		salsa(X, 1);
#else
		for(uint i=0; i<nfact%LOOKUP_GAP; ++i)
			salsa(X, 0);
#endif

#endif



	// Phase 2: nfact data-dependent lookups.
	for(uint i=0; i<nfact; i++){
		uint16 V[2];
		// j is taken from component .sc of X[1], masked with nfact-1
		// (the mask only yields the full range [0, nfact) if nfact is a power
		// of two - presumably guaranteed by the caller; verify).
		// y is the row of the saved entry, j / LOOKUP_GAP, written as a shift
		// for gaps 2, 4 and 8. This inner `y` shadows the function-scope `y`
		// declared by the tail block above.
#if (LOOKUP_GAP == 2)
		uint j = X[1].sc & (nfact-1);
		uint y = (j>>1);
#elif (LOOKUP_GAP == 4)
		uint j = X[1].sc & (nfact-1);
		uint y = (j>>2);
#elif (LOOKUP_GAP == 8)
		uint j = X[1].sc & (nfact-1);
		uint y = (j>>3);
#elif (LOOKUP_GAP != 1)
		uint j = X[1].sc & (nfact-1);
		uint y = (j/LOOKUP_GAP);
#else
		// Every entry was saved, so the masked value is the row itself and no
		// separate j is declared.
		uint y = X[1].sc & (nfact-1);
#endif
		// Fetch the saved entry for row y (CO_0 / CO_1 presumably read `y`).
		V[0] = lookup[CO_0];
		V[1] = lookup[CO_1];
		// If j is not a multiple of LOOKUP_GAP, call salsa() on the fetched
		// entry for the remainder.
#if (LOOKUP_GAP == 2)
		if(j&1)
			salsa(V, 0);
#elif (LOOKUP_GAP != 1)
		// From here on y is reused to hold the remainder j % LOOKUP_GAP,
		// computed as j - (j/LOOKUP_GAP)*LOOKUP_GAP. This must stay after the
		// two lookup reads above, which need the row value of y.
		// Earlier experiments (a direct j%LOOKUP_GAP, a bit-folding reduction
		// for a gap of 3, and the lgt1/lgt2 tables) were dropped for this form.
		y = j-y*LOOKUP_GAP;
		if(y)
			salsa(V, y-1);

#else
		// LOOKUP_GAP == 1: the fetched entry is used as is.
#endif

		// Fold the entry into the state, then call salsa() once on X.
		X[0] ^= V[0];
		X[1] ^= V[1];
#if (LOOKUP_GAP == 2)
		salsa(X, 0);
#elif (LOOKUP_GAP != 1)
		salsa(X, 0);
#else
		salsa(X);
#endif
	}
}

/*
	tmpa = (uint4)(X[3].x,X[2].y,X[1].z,X[0].w);
	tmpb = (uint4)(X[0].x,X[3].y,X[2].z,X[1].w);
	tmpc = (uint4)(X[1].x,X[0].y,X[3].z,X[2].w);
	tmpd = (uint4)(X[2].x,X[1].y,X[0].z,X[3].w);
	tmpa = EndianSwapa(tmpa);
	XA[0] = tmpa.x;
	XB[0] = tmpa.y;
	XC[0] = tmpa.z;
	XD[0] = tmpa.w;
	tmpb = EndianSwapb(tmpb);
	XE[0] = tmpb.x;
	XF[0] = tmpb.y;
	XG[0] = tmpb.z;
	XH[0] = tmpb.w;
	tmpc = EndianSwapb(tmpc);
	XA[1] = tmpc.x;
	XB[1] = tmpc.y;
	XC[1] = tmpc.z;
	XD[1] = tmpc.w;
	tmpd = EndianSwapb(tmpd);
	XE[1] = tmpd.x;
	XF[1] = tmpd.y;
	XG[1] = tmpd.z;
	XH[1] = tmpd.w;
	tmpa = (uint4)(X[7].x,X[6].y,X[5].z,X[4].w);
	tmpb = (uint4)(X[4].x,X[7].y,X[6].z,X[5].w);
	tmpc = (uint4)(X[5].x,X[4].y,X[7].z,X[6].w);
	tmpd = (uint4)(X[6].x,X[5].y,X[4].z,X[7].w);
	tmpa = EndianSwapa(tmpa);
	XA[2] = tmpa.x;
	XB[2] = tmpa.y;
	XC[2] = tmpa.z;
	XD[2] = tmpa.w;
	tmpb = EndianSwapb(tmpb);
	XE[2] = tmpb.x;
	XF[2] = tmpb.y;
	XG[2] = tmpb.z;
	XH[2] = tmpb.w;
	tmpc = EndianSwapb(tmpc);
	XA[3] = tmpc.x;
	XB[3] = tmpc.y;
	XC[3] = tmpc.z;
	XD[3] = tmpc.w;
	tmpd = EndianSwapb(tmpd);
	XE[3] = tmpd.x;
	XF[3] = tmpd.y;
	XG[3] = tmpd.z;
	XH[3] = tmpd.w;
*/

#endif