/*-
 * Copyright 2009 Colin Percival, 2011 ArtForz, 2011 pooler, 2012 mtrlt,
 * 2012-2013 Con Kolivas.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * This file was originally written by Colin Percival as part of the Tarsnap
 * online backup system.
 */

/* N (nfactor), CPU/Memory cost parameter */
//#ifndef NFACTOR
//#define NFACTOR 10
//#endif
/*
#if NFACTOR == 1
#  define __NFACTOR 2u
#elif NFACTOR == 2
# define __NFACTOR 4u
#elif NFACTOR == 3
#  define __NFACTOR 8u
#elif NFACTOR == 4
#  define __NFACTOR 16u
#elif NFACTOR == 5
#  define __NFACTOR 32u
#elif NFACTOR == 6
#  define __NFACTOR 64u
#elif NFACTOR == 7
#  define __NFACTOR 128u
#elif NFACTOR == 8
#  define __NFACTOR 256u
#elif NFACTOR == 9
#  define __NFACTOR 512u
#elif NFACTOR == 10
#  define __NFACTOR 1024u
#elif NFACTOR == 11
#  define __NFACTOR 2048u
#elif NFACTOR == 12
#  define __NFACTOR 4096u
#elif NFACTOR == 13
#  define __NFACTOR 8192u
#elif NFACTOR == 14
#  define __NFACTOR 16384u
#elif NFACTOR == 15
#  define __NFACTOR 32768u
#elif NFACTOR == 16
#  define __NFACTOR 65536u
#elif NFACTOR == 17
#  define __NFACTOR 131072u
#elif NFACTOR == 18
#  define __NFACTOR 262144u
#elif NFACTOR == 19
#  define __NFACTOR 524288u
#elif NFACTOR == 20
#  define __NFACTOR 1048576u
#else
# define __NFACTOR 1024u
#endif

#define DEFNFACTOR(n) \
    const uint n = __NFACTOR;


//Search constants
#define DecAllSK    uint SK00 = 0x5C5C5C5CU; \
                    uint SK01 = 0x36363636U; \
                    uint SK02 = 0x80000000U; \
                    uint SK03 = 0x00000280U; \
                    uint SK04 = 0x000004a0U; \
                    uint SK05 = 0x00000300U; \
					uint SK06 = 0x0U;

*/

/*
 * Coord(x,y,z): flattened index into a 3-D array whose fastest-varying
 * dimension is x, then y, then z.  The extents are taken by token pasting
 * from variables named <arg>SIZE that must be in scope at the point of use
 * (e.g. passing `z` as the first argument makes the macro read `zSIZE`), so
 * the arguments must be plain identifiers.
 *
 * CO: the index used by scrypt_core(); expands to
 *     z + x*zSIZE + y*xSIZE*zSIZE
 * and therefore needs x, y, z, xSIZE and zSIZE in scope.
 *
 * The arguments and the whole expansion are parenthesized so the macro can be
 * embedded in a larger expression without changing its meaning.
 */
#define Coord(x,y,z) ((x)+(y)*(x ## SIZE)+(z)*(y ## SIZE)*(x ## SIZE))
#define CO Coord(z,x,y)

//#define Coord(x,y,z) x+y*(x ## SIZE)+z*(y ## SIZE)*(x ## SIZE)
//#define CO Coord(z,x,y)
//#define Coord(x,y,z) ((ulong)y*(x ## SIZE)*(z ## SIZE)+x*(z ## SIZE)+z)
//#define CO Coord(x,y,z)
//#define Coord(x,y,z) (ulong)x*(z ## SIZE)+y*(x ## SIZE)*(z ## SIZE)+z
//#define CO Coord(x,y,z)

//#define COORDINDEX(x,y,z) x+y*(x ## SIZE)+z*(y ## SIZE)*(x ## SIZE)

//#define IDX COORDINDEX(z,x,y)
//#define COORDINDEX(x,y,z) x+y*(x ## SIZE)+z*(y ## SIZE)*(x ## SIZE)
//#define CO Coord(z,x,y)
//#define CO COORDINDEX(z,x,y)

/* Byte-lane masks used by the endian-swap macros below. */
#define E0 0x00FF00FFU
#define E1 0xFF00FF00U

/*
 * rotl: left rotate, mapped onto the OpenCL built-in rotate().
 * Ch(x,y,z): per bit, takes y where x is 1 and z where x is 0
 *            (bitselect(a,b,c) picks b where c is 1, otherwise a).
 * Maj(x,y,z): per-bit majority, expressed through Ch: where x and z differ
 *            the result is y, otherwise it is z.
 */
#define rotl(x,y) rotate(x,y)
#define Ch(x,y,z) bitselect(z,y,x)
#define Maj(x,y,z) Ch((x^z),y,z)

/*
 * Both macros reverse the byte order of each 32-bit element of n; they only
 * differ in how the result is assembled (bitselect vs. mask-and-or).
 * n is evaluated twice and is not parenthesized in EndianSwapb, so pass a
 * simple, side-effect-free expression.
 */
#define EndianSwapa(n) (Ch(E0, rotl(n, 8U), rotl(n, 24U)))
#define EndianSwapb(n) (rotl(n & E0, 24U)|rotl(n & E1, 8U))

/*
 * SHA-256 sigma functions written with left rotations (a right rotation by r
 * equals a left rotation by 32-r):
 *   Tr2 = rotr 2 ^ rotr 13 ^ rotr 22   (applied to `a` in RND)
 *   Tr1 = rotr 6 ^ rotr 11 ^ rotr 25   (applied to `e` in RND)
 *   Wr2 = rotr 7 ^ rotr 18 ^ shr 3     (message-schedule term)
 *   Wr1 = rotr 17 ^ rotr 19 ^ shr 10   (message-schedule term)
 * The argument is not parenthesized; pass a plain identifier.
 */
#define Tr2(x)		(rotl(x, 30U) ^ rotl(x, 19U) ^ rotl(x, 10U))
#define Tr1(x)		(rotl(x, 26U) ^ rotl(x, 21U) ^ rotl(x, 7U))
#define Wr2(x)		(rotl(x, 25U) ^ rotl(x, 14U) ^ (x>>3U))
#define Wr1(x)		(rotl(x, 15U) ^ rotl(x, 13U) ^ (x>>10U))

/*
 * One compression round.  Only d and h are written: h accumulates the round
 * sum, d receives the partial sum, then h is completed with the `a` terms.
 * Callers rotate the argument order between invocations instead of moving
 * values between variables.  k is the already-summed (message word + round
 * constant) value.
 * NOTE: this expands to several statements and is not wrapped in
 * do { } while (0); use it only as a complete statement inside a braced block.
 */
#define RND(a, b, c, d, e, f, g, h, k)	\
	h += Tr1(e); 			\
	h += Ch(e, f, g); 		\
	h += k;				\
	d += h;				\
	h += Tr2(a); 			\
	h += Maj(a, b, c);



/*
 * Shared constant pool read by SHA256() and search().  Layout, as used by the
 * code in this file:
 *   K[0..14]   round constants added in rounds 0..14 of SHA256()
 *   K[15..62]  round constants added in rounds 16..63 of SHA256()
 *   K[76]      round constant added in round 15 (stored out of sequence)
 *   K[63..75], K[77..81]
 *              precomputed values read only by the notfresh==0 branch of
 *              SHA256(); K[73], K[77..79], K[66], K[67], K[80], K[81] are the
 *              eight words that branch adds back at the end (they equal the
 *              standard SHA-256 initial hash value)
 *   K[82], K[83]  XOR masks applied to the key block in search()
 *                 (presumably the HMAC opad/ipad byte patterns -- confirm)
 *   K[84]      0x80000000, written right after the data in padded blocks
 *   K[85]      not referenced anywhere in this file
 *   K[86..88]  values written to the last word of padded blocks in search()
 * The trailing "// NN" comments give the index of the entry on that line.
 */
__constant uint K[] = {
	0x428a2f98U,
	0x71374491U,
	0xb5c0fbcfU,
	0xe9b5dba5U,
	0x3956c25bU,
	0x59f111f1U,
	0x923f82a4U,
	0xab1c5ed5U,
	0xd807aa98U,
	0x12835b01U,
	0x243185beU, // 10
	0x550c7dc3U,
	0x72be5d74U,
	0x80deb1feU,
	0x9bdc06a7U,
	// index 15 onwards: constants for rounds 16..63 (round 15 lives at K[76])
	0xe49b69c1U,
	0xefbe4786U,
	0x0fc19dc6U,
	0x240ca1ccU,
	0x2de92c6fU,
	0x4a7484aaU, // 20
	0x5cb0a9dcU,
	0x76f988daU,
	0x983e5152U,
	0xa831c66dU,
	0xb00327c8U,
	0xbf597fc7U,
	0xc6e00bf3U,
	0xd5a79147U,
	0x06ca6351U,
	0x14292967U, // 30
	0x27b70a85U,
	0x2e1b2138U,
	0x4d2c6dfcU,
	0x53380d13U,
	0x650a7354U,
	0x766a0abbU,
	0x81c2c92eU,
	0x92722c85U,
	0xa2bfe8a1U,
	0xa81a664bU, // 40
	0xc24b8b70U,
	0xc76c51a3U,
	0xd192e819U,
	0xd6990624U,
	0xf40e3585U,
	0x106aa070U,
	0x19a4c116U,
	0x1e376c08U,
	0x2748774cU,
	0x34b0bcb5U, // 50
	0x391c0cb3U,
	0x4ed8aa4aU,
	0x5b9cca4fU,
	0x682e6ff3U,
	0x748f82eeU,
	0x78a5636fU,
	0x84c87814U,
	0x8cc70208U,
	0x90befffaU,
	0xa4506cebU, // 60
	0xbef9a3f7U,
	0xc67178f2U,
	// index 63 onwards: values read by the notfresh==0 branch of SHA256()
	0x98c7e2a2U,
	0xfc08884dU,
	0xcd2a11aeU,
	0x510e527fU,
	0x9b05688cU,
	0xC3910C8EU,
	0xfb6feee7U,
	0x2a01a605U, // 70
	0x0c2e12e0U,
	0x4498517BU,
	0x6a09e667U,
	0xa4ce148bU,
	0x95F61999U,
	// index 76: round constant for round 15 of SHA256()
	0xc19bf174U,
	0xBB67AE85U,
	0x3C6EF372U,
	0xA54FF53AU,
	0x1F83D9ABU, // 80
	0x5BE0CD19U,
	// index 82 onwards: masks and padding words used by search()
    0x5C5C5C5CU,
    0x36363636U,
    0x80000000U,
    0x000003FFU,
    0x00000280U,
    0x000004a0U,
    0x00000300U
};



/*
 * Pack four consecutive uints into one vector.
 *   dest   - receives the vector; x..w are filled from source[0]..source[3]
 *   source - points at (at least) four readable uints
 */
void uintp_to_uint4p(uint4 *dest, uint *source){
    *dest = (uint4)(source[0], source[1], source[2], source[3]);
}

/*
 * Unpack the vector *source into four consecutive uints.
 *   dest   - receives x, y, z, w in dest[0]..dest[3]
 *   source - vector to read (read once, before any store to dest)
 */
void uint4p_to_uintp(uint *dest, uint4 *source){
    const uint4 v = *source;

    dest[0] = v.s0;
    dest[1] = v.s1;
    dest[2] = v.s2;
    dest[3] = v.s3;
}

/*
 * Unpack a vector passed by value into four consecutive uints:
 * dest[0]..dest[3] receive source.x, .y, .z, .w (written here with the
 * equivalent numeric component names s0..s3).
 */
void uint4_to_uintp(uint *dest, uint4 source){
    dest[0] = source.s0;
    dest[1] = source.s1;
    dest[2] = source.s2;
    dest[3] = source.s3;
}

/*
 * One SHA-256 compression of a 16-word block, fully unrolled.
 *
 *   state    - 8 words, overwritten with the result.
 *   block    - 16 message words; only read.
 *   notfresh - non-zero: continue from the chaining value in state[]
 *              (result = state + rounds(state, block)).
 *              zero: state[] is treated as output only.  The first four rounds
 *              are evaluated from precomputed constants in K[63..75] and the
 *              words added back at the end are K[73], K[77..79], K[66], K[67],
 *              K[80], K[81].  The initial loads from state[] below are all
 *              overwritten on this path before being used.
 *
 * The working variables A..H are never moved; each RND invocation is passed
 * them in rotated order instead.  The message schedule is kept in the sixteen
 * scalars W0x..W3w and updated in place, sixteen rounds at a time.
 */
void SHA256(uint*restrict state, uint*restrict block, bool notfresh){
    uint A = state[0];
    uint B = state[1];
    uint C = state[2];
    uint D = state[3];
    uint E = state[4];
    uint F = state[5];
    uint G = state[6];
    uint H = state[7];
	uint W0x = block[0];
	uint W0y = block[1];
	uint W0z = block[2];
	uint W0w = block[3];
    uint W1x = block[4];
    uint W1y = block[5];
    uint W1z = block[6];
    uint W1w = block[7];
    uint W2x = block[8];
    uint W2y = block[9];
    uint W2z = block[10];
    uint W2w = block[11];
    uint W3x = block[12];
    uint W3y = block[13];
    uint W3z = block[14];
    uint W3w = block[15];
    // T0*/T1* hold the eight words added to A..H after the last round.
    uint T0x = state[0];
    uint T0y = state[1];
    uint T0z = state[2];
    uint T0w = state[3];
    uint T1x = state[4];
    uint T1y = state[5];
    uint T1z = state[6];
    uint T1w = state[7];

	// Scratch copies of the K[] entries needed by the next group of rounds.
	uint K0, K1, K2, K3, K4, K5, K6, K7;

	if(notfresh){
		// Rounds 0..3 starting from the caller's chaining value.
		K0 = K[0];
		K1 = K[1];
		K2 = K[2];
		K3 = K[3];
		RND(A,B,C,D,E,F,G,H, W0x + K0);
		RND(H,A,B,C,D,E,F,G, W0y + K1);
		RND(G,H,A,B,C,D,E,F, W0z + K2);
		RND(F,G,H,A,B,C,D,E, W0w + K3);
	}else{
		// Rounds 0..3 with the chaining value folded into constants: every
		// one of A..H is assigned here, so the loads from state[] are dead.
		K0 = K[63];
		K1 = K[64];
		K2 = K[65];
		K3 = K[66];
		K4 = K[67];
        K5 = K[68];
        K6 = K[69];
        K7 = K[70];

		D = K0 + W0x;
		H = K1 + W0x;
		C = K2 + Tr1(D) + Ch(D, K3, K4) + W0y;
		G = K5 + C + Tr2(H) + Ch(H, K6, K7);
		// K3/K4 double as two of the words added back at the end; capture
		// them before K4 is reloaded below.
		T1x = K3;
		T1y = K4;
        K4 = K[71];
        K5 = K[72];
        K6 = K[73];
        K7 = K[74];
        K0 = K[75];
		B = K4 + Tr1(C) + Ch(C,D,K3) + W0z;
		F = K5 + B + Tr2(G) + Maj(G,H, K6);
		A = K7 + Tr1(B) + Ch(B,C,D) + W0w;
		E = K0 + A + Tr2(F) + Maj(F,G,H);

		//save K constants for fresh condition
		T0x = K6;
		T0y = K[77];
		T0z = K[78];
		T0w = K[79];

		T1z = K[80];
		T1w = K[81];
	}
	
    // Rounds 4..11: message words used as loaded.
    K0 = K[4];
    K1 = K[5];
    K2 = K[6];
    K3 = K[7];
    K4 = K[8];
    K5 = K[9];
    K6 = K[10];
    K7 = K[11];

	RND(E,F,G,H,A,B,C,D, W1x + K0);
	RND(D,E,F,G,H,A,B,C, W1y + K1);
	RND(C,D,E,F,G,H,A,B, W1z + K2);
	RND(B,C,D,E,F,G,H,A, W1w + K3);

	RND(A,B,C,D,E,F,G,H, W2x + K4);
	RND(H,A,B,C,D,E,F,G, W2y + K5);
	RND(G,H,A,B,C,D,E,F, W2z + K6);
	RND(F,G,H,A,B,C,D,E, W2w + K7);

    // Rounds 12..15 use K0..K2 and K7; note that the round-15 constant is
    // stored at K[76], so K[15]..K[18] (loaded into K3..K6) belong to
    // rounds 16..19 below.
    K7 = K[76];
    K0 = K[12];
    K1 = K[13];
    K2 = K[14];
    K3 = K[15];
    K4 = K[16];
    K5 = K[17];
    K6 = K[18];

	RND(E,F,G,H,A,B,C,D, W3x + K0);
	RND(D,E,F,G,H,A,B,C, W3y + K1);
	RND(C,D,E,F,G,H,A,B, W3z + K2);
	RND(B,C,D,E,F,G,H,A, W3w + K7);


	// From here on each message word is regenerated in place just before its
	// round: W[t] = W[t-16] + Wr1(W[t-2]) + W[t-7] + Wr2(W[t-15]).
	// Rounds 16..19.
	W0x += Wr1(W3z) + W2y + Wr2(W0y);
	RND(A,B,C,D,E,F,G,H, W0x + K3);
	W0y += Wr1(W3w) + W2z + Wr2(W0z);
	RND(H,A,B,C,D,E,F,G, W0y + K4);
	W0z += Wr1(W0x) + W2w + Wr2(W0w);
	RND(G,H,A,B,C,D,E,F, W0z + K5);
	W0w += Wr1(W0y) + W3x + Wr2(W1x);
	RND(F,G,H,A,B,C,D,E, W0w + K6);

    // Rounds 20..27.
    K0 = K[19];
    K1 = K[20];
    K2 = K[21];
    K3 = K[22];
    K4 = K[23];
    K5 = K[24];
    K6 = K[25];
    K7 = K[26];

	W1x += Wr1(W0z) + W3y + Wr2(W1y);
	RND(E,F,G,H,A,B,C,D, W1x + K0);
	W1y += Wr1(W0w) + W3z + Wr2(W1z);
	RND(D,E,F,G,H,A,B,C, W1y + K1);
	W1z += Wr1(W1x) + W3w + Wr2(W1w);
	RND(C,D,E,F,G,H,A,B, W1z + K2);
	W1w += Wr1(W1y) + W0x + Wr2(W2x);
	RND(B,C,D,E,F,G,H,A, W1w + K3);

	W2x += Wr1(W1z) + W0y + Wr2(W2y);
	RND(A,B,C,D,E,F,G,H, W2x + K4);
	W2y += Wr1(W1w) + W0z + Wr2(W2z);
	RND(H,A,B,C,D,E,F,G, W2y + K5);
	W2z += Wr1(W2x) + W0w + Wr2(W2w);
	RND(G,H,A,B,C,D,E,F, W2z + K6);
	W2w += Wr1(W2y) + W1x + Wr2(W3x);
	RND(F,G,H,A,B,C,D,E, W2w + K7);

    // Rounds 28..35.
    K0 = K[27];
    K1 = K[28];
    K2 = K[29];
    K3 = K[30];
    K4 = K[31];
    K5 = K[32];
    K6 = K[33];
    K7 = K[34];

	W3x += Wr1(W2z) + W1y + Wr2(W3y);
	RND(E,F,G,H,A,B,C,D, W3x + K0);
	W3y += Wr1(W2w) + W1z + Wr2(W3z);
	RND(D,E,F,G,H,A,B,C, W3y + K1);
	W3z += Wr1(W3x) + W1w + Wr2(W3w);
	RND(C,D,E,F,G,H,A,B, W3z + K2);
	W3w += Wr1(W3y) + W2x + Wr2(W0x);
	RND(B,C,D,E,F,G,H,A, W3w + K3);

	W0x += Wr1(W3z) + W2y + Wr2(W0y);
	RND(A,B,C,D,E,F,G,H, W0x + K4);
	W0y += Wr1(W3w) + W2z + Wr2(W0z);
	RND(H,A,B,C,D,E,F,G, W0y + K5);
	W0z += Wr1(W0x) + W2w + Wr2(W0w);
	RND(G,H,A,B,C,D,E,F, W0z + K6);
	W0w += Wr1(W0y) + W3x + Wr2(W1x);
	RND(F,G,H,A,B,C,D,E, W0w + K7);

    // Rounds 36..43.
    K0 = K[35];
    K1 = K[36];
    K2 = K[37];
    K3 = K[38];
    K4 = K[39];
    K5 = K[40];
    K6 = K[41];
    K7 = K[42];

	W1x += Wr1(W0z) + W3y + Wr2(W1y);
	RND(E,F,G,H,A,B,C,D, W1x + K0);
	W1y += Wr1(W0w) + W3z + Wr2(W1z);
	RND(D,E,F,G,H,A,B,C, W1y + K1);
	W1z += Wr1(W1x) + W3w + Wr2(W1w);
	RND(C,D,E,F,G,H,A,B, W1z + K2);
	W1w += Wr1(W1y) + W0x + Wr2(W2x);
	RND(B,C,D,E,F,G,H,A, W1w + K3);

	W2x += Wr1(W1z) + W0y + Wr2(W2y);
	RND(A,B,C,D,E,F,G,H, W2x + K4);
	W2y += Wr1(W1w) + W0z + Wr2(W2z);
	RND(H,A,B,C,D,E,F,G, W2y + K5);
	W2z += Wr1(W2x) + W0w + Wr2(W2w);
	RND(G,H,A,B,C,D,E,F, W2z + K6);
	W2w += Wr1(W2y) + W1x + Wr2(W3x);
	RND(F,G,H,A,B,C,D,E, W2w + K7);

    // Rounds 44..51.
    K0 = K[43];
    K1 = K[44];
    K2 = K[45];
    K3 = K[46];
    K4 = K[47];
    K5 = K[48];
    K6 = K[49];
    K7 = K[50];

	W3x += Wr1(W2z) + W1y + Wr2(W3y);
	RND(E,F,G,H,A,B,C,D, W3x + K0);
	W3y += Wr1(W2w) + W1z + Wr2(W3z);
	RND(D,E,F,G,H,A,B,C, W3y + K1);
	W3z += Wr1(W3x) + W1w + Wr2(W3w);
	RND(C,D,E,F,G,H,A,B, W3z + K2);
	W3w += Wr1(W3y) + W2x + Wr2(W0x);
	RND(B,C,D,E,F,G,H,A, W3w + K3);

	W0x += Wr1(W3z) + W2y + Wr2(W0y);
	RND(A,B,C,D,E,F,G,H, W0x + K4);
	W0y += Wr1(W3w) + W2z + Wr2(W0z);
	RND(H,A,B,C,D,E,F,G, W0y + K5);
	W0z += Wr1(W0x) + W2w + Wr2(W0w);
	RND(G,H,A,B,C,D,E,F, W0z + K6);
	W0w += Wr1(W0y) + W3x + Wr2(W1x);
	RND(F,G,H,A,B,C,D,E, W0w + K7);

    // Rounds 52..59.
    K0 = K[51];
    K1 = K[52];
    K2 = K[53];
    K3 = K[54];
    K4 = K[55];
    K5 = K[56];
    K6 = K[57];
    K7 = K[58];

	W1x += Wr1(W0z) + W3y + Wr2(W1y);
	RND(E,F,G,H,A,B,C,D, W1x + K0);
	W1y += Wr1(W0w) + W3z + Wr2(W1z);
	RND(D,E,F,G,H,A,B,C, W1y + K1);
	W1z += Wr1(W1x) + W3w + Wr2(W1w);
	RND(C,D,E,F,G,H,A,B, W1z + K2);
	W1w += Wr1(W1y) + W0x + Wr2(W2x);
	RND(B,C,D,E,F,G,H,A, W1w + K3);

	W2x += Wr1(W1z) + W0y + Wr2(W2y);
	RND(A,B,C,D,E,F,G,H, W2x + K4);
	W2y += Wr1(W1w) + W0z + Wr2(W2z);
	RND(H,A,B,C,D,E,F,G, W2y + K5);
	W2z += Wr1(W2x) + W0w + Wr2(W2w);
	RND(G,H,A,B,C,D,E,F, W2z + K6);
	W2w += Wr1(W2y) + W1x + Wr2(W3x);
	RND(F,G,H,A,B,C,D,E, W2w + K7);

    // Rounds 60..63.
    K4 = K[59];
    K5 = K[60];
    K6 = K[61];
    K7 = K[62];

	W3x += Wr1(W2z) + W1y + Wr2(W3y);
	RND(E,F,G,H,A,B,C,D, W3x + K4);
	W3y += Wr1(W2w) + W1z + Wr2(W3z);
	RND(D,E,F,G,H,A,B,C, W3y + K5);
	W3z += Wr1(W3x) + W1w + Wr2(W3w);
	RND(C,D,E,F,G,H,A,B, W3z + K6);
	W3w += Wr1(W3y) + W2x + Wr2(W0x);
	RND(B,C,D,E,F,G,H,A, W3w + K7);

//if(notfresh){	


//    T0x += A;
//    T0y += B;
//    T0z += C;
//    T0w += D;
//    T1x += E;
//    T1y += F;
//    T1z += G;
//    T1w += H;
	// Feed-forward: add the chaining value (or, for notfresh==0, the
	// constants captured above) to the working variables.
	A += T0x;
	B += T0y;
	C += T0z;
	D += T0w;
	E += T1x;
	F += T1y;
	G += T1z;
	H += T1w;



/*
    state[0] += A;
    state[1] += B;
    state[2] += C;
    state[3] += D;
    state[4] += E;
    state[5] += F;
    state[6] += G;
    state[7] += H;
//     += (uint4)(K[73], K[77], K[78], K[79]);
  //   += (uint4)(K[66], K[67], K[80], K[81]);
A += K[73];
B += K[77];
C += K[78];
D += K[79];
E += K[66];
F += K[67];
G += K[80];
H += K[81];
*/


    // Publish the new chaining value.
    state[0] = A;
    state[1] = B;
    state[2] = C;
    state[3] = D;
    state[4] = E;
    state[5] = F;
    state[6] = G;
    state[7] = H;





/*
    state[0] = T0x;
    state[1] = T0y;
    state[2] = T0z;
    state[3] = T0w;
    state[4] = T1x;
    state[5] = T1y;
    state[6] = T1z;
    state[7] = T1w;
*/

}












/*
 * In-place mixing of the four vectors w[0..3]: four passes, each made of
 * eight dependent add / rotate-left / xor steps.  The first four steps of a
 * pass combine the vectors lane-for-lane; the last four combine swizzled
 * lanes (.wxyz and .zwxy).  The statement order matters because every step
 * reads values written by the previous ones.
 * (Presumably four double rounds of Salsa20 on a lane-shuffled layout --
 * confirm against the caller's shuffling in scrypt_core().)
 */
void halfsalsa(uint4 *w){
	for(uint pass=0; pass<4; pass++){
		/* lane-aligned half of the pass */
		w[0] = w[0] ^ rotl(w[3] + w[2],  7U);
		w[1] = w[1] ^ rotl(w[0] + w[3],  9U);
		w[2] = w[2] ^ rotl(w[1] + w[0], 13U);
		w[3] = w[3] ^ rotl(w[2] + w[1], 18U);

		/* swizzled half of the pass */
		w[2] = w[2] ^ rotl(w[3].wxyz + w[0].zwxy,  7U);
		w[1] = w[1] ^ rotl(w[2].wxyz + w[3].zwxy,  9U);
		w[0] = w[0] ^ rotl(w[1].wxyz + w[2].zwxy, 13U);
		w[3] = w[3] ^ rotl(w[0].wxyz + w[1].zwxy, 18U);
	}
}

/*
 * Mixes the eight vectors B[0..7] in place, treating them as a low half
 * B[0..3] and a high half B[4..7]:
 *   1. low ^= high, run halfsalsa() on a copy of low;
 *   2. low += mixed copy, high ^= low, run halfsalsa() on a copy of high;
 *   3. high += mixed copy.
 * When built with LOOKUP_GAP == 2 the function takes an extra flag `db`;
 * if it is set, the two mixing stages are run a second time before the final
 * addition, i.e. two mixes are fused into one call.
 */
#if (LOOKUP_GAP == 2)
void salsa(uint4 *B, bool db){
#else
void salsa(uint4 *B){
#endif
    uint4 w[4];
    uint k;

	/* stage 1: fold the high half into the low half */
	for(k=0; k<4; k++){
		B[k] ^= B[k+4];
		w[k] = B[k];
	}
	halfsalsa(w);

	/* stage 2: apply the mixed low half, then fold it into the high half */
	for(k=0; k<4; k++){
		B[k] += w[k];
		B[k+4] ^= B[k];
		w[k] = B[k+4];
	}
	halfsalsa(w);

#if (LOOKUP_GAP == 2)
	if(db){
		/* second mix: same two stages, starting from the pending high-half
		   addition */
		for(k=0; k<4; k++){
			B[k+4] += w[k];
			B[k] ^= B[k+4];
			w[k] = B[k];
		}
		halfsalsa(w);

		for(k=0; k<4; k++){
			B[k] += w[k];
			B[k+4] ^= B[k];
			w[k] = B[k+4];
		}
		halfsalsa(w);
	}
#endif

	/* final addition of the mixed high half */
	for(k=0; k<4; k++)
		B[k+4] += w[k];
}


/*
 * Memory-hard core: mixes the eight vectors X[0..7] in place, using `lookup`
 * as a scratchpad.
 *
 *   X      - eight uint4 values, read and overwritten.
 *   lookup - global scratchpad, indexed with CO as
 *            z + x*zSIZE + y*xSIZE*zSIZE, where x is this work-item's slot
 *            (global id modulo CONCURRENT_THREADS), y the stored row and z the
 *            vector within the row.  The caller must provide at least
 *            xSIZE*ySIZE*zSIZE elements (assumption -- the allocation is not
 *            visible in this file).
 *
 * Phase 1 runs nfact mixes of X and stores X into the scratchpad before
 * every LOOKUP_GAP-th mix (rows 0 .. ySIZE-1).
 * Phase 2 runs nfact iterations; each picks row index j from X[7].x, obtains
 * the block that phase 1 had at step j (loading row j/LOOKUP_GAP and
 * re-applying j%LOOKUP_GAP mixes when that step was not stored), XORs it into
 * X and mixes X once.
 *
 * Requires the build-time macros LOOKUP_GAP and CONCURRENT_THREADS.
 *
 * Fixes relative to the previous revision:
 *  - For LOOKUP_GAP other than 1 or 2, phase 2 XORed the stored row into X
 *    before recomputing it and then mixed the scratch copy instead of X, so X
 *    was never mixed.  The order is now: recompute, XOR, mix X.
 *  - The tail store after phase 1 is skipped when nfact is a multiple of
 *    LOOKUP_GAP; it previously wrote a row at y == ySIZE, outside the extent
 *    computed below, for gaps such as 16.
 */
void scrypt_core(uint4 *X, __global uint4 *restrict lookup){
	/* Number of mix iterations; fixed at 1024 (the configurable NFACTOR
	   selection at the top of the file is commented out). */
	const uint nfact = 1024;
	/* Number of rows stored per work-item.  Not read by CO; kept to document
	   the extent of the y dimension. */
	const uint ySIZE = (nfact/LOOKUP_GAP+(nfact%LOOKUP_GAP>0));
	const uint zSIZE = 8;
	const uint xSIZE = CONCURRENT_THREADS;
	uint x = get_global_id(0)%xSIZE;

	/* Gather diagonal lanes of each half of X and byte-swap every word, so
	   that halfsalsa() can work on whole vectors. */
	uint4 tmpa = (uint4)(X[1].x,X[2].y,X[3].z,X[0].w);
	uint4 tmpb = (uint4)(X[2].x,X[3].y,X[0].z,X[1].w);
	uint4 tmpc = (uint4)(X[3].x,X[0].y,X[1].z,X[2].w);
	uint4 tmpd = (uint4)(X[0].x,X[1].y,X[2].z,X[3].w);

	X[0] = EndianSwapa(tmpa);
	X[1] = EndianSwapb(tmpb);
	X[2] = EndianSwapb(tmpc);
	X[3] = EndianSwapb(tmpd);

	tmpa = (uint4)(X[5].x,X[6].y,X[7].z,X[4].w);
	tmpb = (uint4)(X[6].x,X[7].y,X[4].z,X[5].w);
	tmpc = (uint4)(X[7].x,X[4].y,X[5].z,X[6].w);
	tmpd = (uint4)(X[4].x,X[5].y,X[6].z,X[7].w);

	X[4] = EndianSwapa(tmpa);
	X[5] = EndianSwapb(tmpb);
	X[6] = EndianSwapb(tmpc);
	X[7] = EndianSwapb(tmpd);

	/* Phase 1: store X, then advance it by LOOKUP_GAP mixes. */
	for(uint y=0; y<(nfact/LOOKUP_GAP); ++y){
		for(uint z=0; z<zSIZE; ++z)
			lookup[CO] = X[z];
#if (LOOKUP_GAP == 2)
		salsa(X, 1);	/* db=1: two mixes fused into one call */
#elif (LOOKUP_GAP == 1)
		salsa(X);
#else
		for(uint i=0; i<LOOKUP_GAP; ++i)
			salsa(X);
#endif
	}
#if (LOOKUP_GAP != 1) && (LOOKUP_GAP != 2) && (LOOKUP_GAP != 4) && (LOOKUP_GAP != 8)
	/* Leftover steps when LOOKUP_GAP does not divide nfact: store one more
	   row and finish the remaining mixes.  Skipped entirely when there is no
	   remainder, so no row beyond ySIZE-1 is written. */
	if((nfact%LOOKUP_GAP) != 0){
		const uint y = (nfact/LOOKUP_GAP);
		for(uint z=0; z<zSIZE; ++z)
			lookup[CO] = X[z];
		for(uint i=0; i<(nfact%LOOKUP_GAP); ++i)
			salsa(X);
	}
#endif

	/* Phase 2: data-dependent reads from the scratchpad. */
	for (uint i=0; i<nfact; ++i){
		/* nfact is a power of two, so the mask equals "mod nfact". */
		const uint j = X[7].x & (nfact-1);

#if (LOOKUP_GAP == 1)
		/* Every step was stored: XOR the row straight in. */
		const uint y = j;
		for(uint z=0; z<zSIZE; ++z)
			X[z] ^= lookup[CO];
		salsa(X);
#elif (LOOKUP_GAP == 2)
		const uint y = (j>>1);
		if(j&1){
			/* Odd step: recompute it from the stored even step. */
			uint4 V[8];
			for(uint z=0; z<zSIZE; ++z)
				V[z] = lookup[CO];
			salsa(V, 0);
			for(uint z=0; z<zSIZE; ++z)
				X[z] ^= V[z];
		}else{
			for(uint z=0; z<zSIZE; ++z)
				X[z] ^= lookup[CO];
		}
		salsa(X, 0);
#else
		/* General gap: load the nearest stored step at or below j ... */
		const uint y = (j/LOOKUP_GAP);
		uint4 V[8];
		for(uint z=0; z<zSIZE; ++z)
			V[z] = lookup[CO];
		/* ... advance the copy to step j ... */
		const uint val = j%LOOKUP_GAP;
		for(uint k=0; k<val; ++k)
			salsa(V);
		/* ... then combine it with X and mix X. */
		for(uint z=0; z<zSIZE; ++z)
			X[z] ^= V[z];
		salsa(X);
#endif
	}

	/* Scatter the lanes back (inverse of the gather above) and byte-swap. */
	tmpa = (uint4)(X[3].x,X[2].y,X[1].z,X[0].w);
	tmpb = (uint4)(X[0].x,X[3].y,X[2].z,X[1].w);
	tmpc = (uint4)(X[1].x,X[0].y,X[3].z,X[2].w);
	tmpd = (uint4)(X[2].x,X[1].y,X[0].z,X[3].w);

	X[0] = EndianSwapa(tmpa);
	X[1] = EndianSwapb(tmpb);
	X[2] = EndianSwapb(tmpc);
	X[3] = EndianSwapb(tmpd);

	tmpa = (uint4)(X[7].x,X[6].y,X[5].z,X[4].w);
	tmpb = (uint4)(X[4].x,X[7].y,X[6].z,X[5].w);
	tmpc = (uint4)(X[5].x,X[4].y,X[7].z,X[6].w);
	tmpd = (uint4)(X[6].x,X[5].y,X[4].z,X[7].w);

	X[4] = EndianSwapa(tmpa);
	X[5] = EndianSwapb(tmpb);
	X[6] = EndianSwapb(tmpc);
	X[7] = EndianSwapb(tmpd);
}


/*
 * Per-round inputs for the 64 fixed-schedule rounds run in search().
 * The loop there executes, for i = 0..7, eight RND invocations that take
 * fixedWa[i], fixedWb[i], ..., fixedWh[i] in that order, so round 8*i + n
 * reads element i of the n-th table (a = 0 ... h = 7).
 * Each value is passed as RND's `k` argument, i.e. it already stands for the
 * complete (message word + round constant) sum; presumably these were
 * precomputed for a message block whose contents never change -- confirm
 * before editing any value.
 */
__constant uint fixedWa[8] = {0x428a2f99,0xd807aa98,0xf59b89c2,0xb707775c,0xad87a3ea,0xc91b1417,0xe64fb6a2,0xe0a1adbe};
__constant uint fixedWb[8] = {0xf1374491,0x12835b01,0x73924787,0x0468c23f,0xbcb1d3a3,0xc359dce1,0xe84d923a,0x7c728e11};
__constant uint fixedWc[8] = {0xb5c0fbcf,0x243185be,0x23c6886e,0xe7e72b4c,0x7b993186,0xa83253a7,0xe93a5730,0x511c78e4};
__constant uint fixedWd[8] = {0xe9b5dba5,0x550c7dc3,0xa42ca65c,0x49e1f1a2,0x562b9420,0x3b13c12d,0x09837686,0x315b45bd};
__constant uint fixedWe[8] = {0x3956c25b,0x72be5d74,0x15ed3627,0x4b99c816,0xbff3ca0c,0x9d3d725d,0x078ff753,0xfca71413};
__constant uint fixedWf[8] = {0x59f111f1,0x80deb1fe,0x4d6edcbf,0x926d1570,0xda4b0c23,0xd9031a84,0x29833341,0xea28f96a};
__constant uint fixedWg[8] = {0x923f82a4,0x9bdc06a7,0xe28217fc,0xaa0fc072,0x6cd8711a,0xb1a03340,0xd5de0b7e,0x79703128};
__constant uint fixedWh[8] = {0xab1c5ed5,0xc19bf794,0xef02488f,0xadb36e2c,0x8f337caa,0x16f58012,0x6948ccf4,0x4e1ef848};


/*
 * Result reporting.  output[FOUND] is a counter of reported values; SETFOUND
 * stores Xnonce at the index given by the counter's current value and then
 * increments the counter.  The macro refers to a variable named `output` that
 * must be in scope where it is used.
 * NOTE(review): the increment is a plain read-modify-write on global memory,
 * not an atomic operation, so two work-items reporting at the same time can
 * overwrite each other's entry -- confirm whether the host tolerates this.
 * NOTE(review): nothing here bounds the counter; the host presumably resets
 * it and sizes the buffer for at least FOUND+1 words -- confirm.
 */
#define FOUND (0xFF)
#define SETFOUND(Xnonce) output[output[FOUND]++] = Xnonce

/*
 * Kernel entry point: evaluates one candidate per work-item and reports the
 * work-item's global id through SETFOUND when the result meets `target`.
 *
 *   input       - five uint4 values; input[0..3] form one 16-word block and
 *                 input[4].x/.y/.z are the first three words of the next
 *                 block, whose fourth word is replaced by the global id.
 *   output      - result buffer written by SETFOUND.
 *   padcache    - scratchpad handed to scrypt_core().
 *   midstate0, midstate16
 *               - eight-word hash state used as the chaining value for the
 *                 very first compression (presumably the SHA-256 state after
 *                 hashing input[0..3] -- computed by the host, confirm).
 *   target      - a candidate is reported when the byte-swapped last word of
 *                 the final hash is <= target.
 *
 * Outline (as far as the code shows; the HMAC/PBKDF2 naming is inferred from
 * the 0x5c/0x36 masks and the block layouts):
 *   1. Hash the 20-word message with the midstate -> 8-word key.
 *   2. Derive an outer state (key ^ K[82]) and an inner state (key ^ K[83]).
 *   3. Produce the eight vectors of X with four inner/outer hash pairs that
 *      differ only in the counter word data[4] = 1..4.
 *   4. Run scrypt_core() on X.
 *   5. Absorb X into the saved inner state, finish with the fixed-schedule
 *      block, and run the outer hash over the result.
 *
 * Fix relative to the previous revision: the 64 fixed-schedule rounds were
 * run in place on tstatebak without adding the pre-round state back in, so
 * that compression lacked the feed-forward step that SHA256() performs for
 * every other block.  The state is now saved before the rounds and added
 * afterwards.
 */
__attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
__kernel void search(__global const uint4 * restrict input,
volatile __global uint*restrict output, __global uint4*restrict padcache,
const uint4 midstate0, const uint4 midstate16, const uint target){
	uint gid = get_global_id(0);
	uint4 X[8];
	/* tstate/ostate are deliberately left uninitialized: their first use is
	   SHA256(..., 0), which assigns every state word before using it. */
	uint tstate[8];
	uint ostate[8];
	uint tstatebak[8];
	uint tmp[16];

	/* pad[0..7] starts as the midstate and is used as the chaining value of
	   the first compression; pad[8..15] is overwritten before it is read as
	   block data. */
	uint pad[16] = {midstate0.x, midstate0.y, midstate0.z, midstate0.w, midstate16.x, midstate16.y, midstate16.z, midstate16.w,
					K[82], K[82], K[82], K[82], K[82], K[82], K[82], K[82]};
	/* Last block of the message: three input words, the candidate (gid),
	   K[84] right after the data, zeros, and K[86] in the last word. */
	uint data[16] = {input[4].x, input[4].y, input[4].z, gid, K[84], 0x0U, 0x0U, 0x0U, 0x0U, 0x0U, 0x0U, 0x0U, 0x0U, 0x0U, 0x0U, K[86]};

	/* Step 1: pad[0..7] becomes the hash of the full message (the key). */
	SHA256(pad, data, 1);

	/* Step 2: outer state from key ^ K[82], inner state from key ^ K[83];
	   both blocks are filled up with the mask itself. */
	for(uint i=0; i<8; i++)
		tmp[i] = pad[i];
	for(uint i=0; i<8; i++)
		pad[i] ^= K[82];
	for(uint i=8; i<16; i++)
		pad[i] = K[82];

	SHA256(ostate, pad, 0);

	for(uint i=0; i<8; i++)
		pad[i] = tmp[i]^ K[83];
	for(uint i=8; i<16; i++)
		pad[i] = K[83];

	SHA256(tstate, pad, 0);

	/* Keep the bare inner state for step 5. */
	for(uint i=0; i<8; i++)
		tstatebak[i] = tstate[i];

	/* Absorb the first 16 message words into the inner state. */
	tmp[0] = input[0].x;
	tmp[1] = input[0].y;
	tmp[2] = input[0].z;
	tmp[3] = input[0].w;
	tmp[4] = input[1].x;
	tmp[5] = input[1].y;
	tmp[6] = input[1].z;
	tmp[7] = input[1].w;
	tmp[8] = input[2].x;
	tmp[9] = input[2].y;
	tmp[10] = input[2].z;
	tmp[11] = input[2].w;
	tmp[12] = input[3].x;
	tmp[13] = input[3].y;
	tmp[14] = input[3].z;
	tmp[15] = input[3].w;

	SHA256(tstate, tmp, 1);

	/* Upper half of `pad` when it is used as the outer-hash block below;
	   constant across the loop. */
	pad[8]  = K[84];
	pad[9]  = 0x0U;
	pad[10] = 0x0U;
	pad[11] = 0x0U;
	pad[12] = 0x0U;
	pad[13] = 0x0U;
	pad[14] = 0x0U;
	pad[15] = K[88];

	/* data[0..3] still hold the message tail; data[4] becomes the counter
	   that is incremented at the top of every loop iteration. */
	data[4] = 0x0U;
	data[5] = K[84];
	data[6] = 0x0U;
	data[7] = 0x0U;
	data[8] = 0x0U;
	data[9] = 0x0U;
	data[10] = 0x0U;
	data[11] = 0x0U;
	data[12] = 0x0U;
	data[13] = 0x0U;
	data[14] = 0x0U;
	data[15] = K[87];

	/* Step 3: four inner/outer hash pairs, each yielding two vectors of X. */
	for(uint i=0; i<4; i++){
		for(uint w=0; w<8; w++){
			pad[w] = tstate[w];
			tmp[w] = ostate[w];
		}
		data[4]++;
		SHA256(pad, data, 1);	/* inner hash -> pad[0..7] */
		SHA256(tmp, pad, 1);	/* outer hash over pad[0..15] -> tmp[0..7] */
		uintp_to_uint4p(&X[i*2 ], &tmp[0]);
		uintp_to_uint4p(&X[i*2+1], &tmp[4]);
	}

	/* Step 4. */
	scrypt_core(X,padcache);

	/* Step 5: absorb X (two 16-word blocks) into the saved inner state. */
	uint4_to_uintp(tmp,    X[0]);
	uint4_to_uintp(tmp+4,  X[1]);
	uint4_to_uintp(tmp+8,  X[2]);
	uint4_to_uintp(tmp+12, X[3]);
	SHA256(tstatebak,tmp, 1);
	uint4_to_uintp(tmp,    X[4]);
	uint4_to_uintp(tmp+4,  X[5]);
	uint4_to_uintp(tmp+8,  X[6]);
	uint4_to_uintp(tmp+12, X[7]);
	SHA256(tstatebak,tmp, 1);

	/* Final inner block with the precomputed schedule in fixedWa..fixedWh.
	   The rounds run in place on tstatebak, so the incoming state is saved
	   first and added back afterwards (feed-forward). */
	uint fixedin[8];
	for(uint i=0; i<8; i++)
		fixedin[i] = tstatebak[i];

#define A (tstatebak[0])
#define B (tstatebak[1])
#define C (tstatebak[2])
#define D (tstatebak[3])
#define E (tstatebak[4])
#define F (tstatebak[5])
#define G (tstatebak[6])
#define H (tstatebak[7])
    for(uint i=0; i<8; i++){
        RND(A,B,C,D,E,F,G,H, fixedWa[i]);
        RND(H,A,B,C,D,E,F,G, fixedWb[i]);
        RND(G,H,A,B,C,D,E,F, fixedWc[i]);
        RND(F,G,H,A,B,C,D,E, fixedWd[i]);
        RND(E,F,G,H,A,B,C,D, fixedWe[i]);
        RND(D,E,F,G,H,A,B,C, fixedWf[i]);
        RND(C,D,E,F,G,H,A,B, fixedWg[i]);
        RND(B,C,D,E,F,G,H,A, fixedWh[i]);
    }
#undef A
#undef B
#undef C
#undef D
#undef E
#undef F
#undef G
#undef H

	for(uint i=0; i<8; i++)
		tstatebak[i] += fixedin[i];

	/* Outer hash over the inner result, using the same upper-half layout as
	   pad[8..15] above. */
	for(uint i=0; i<8; i++)
		tmp[i] = tstatebak[i];
	tmp[8] = K[84];
	tmp[9] = 0x0U;
	tmp[10] = 0x0U;
	tmp[11] = 0x0U;
	tmp[12] = 0x0U;
	tmp[13] = 0x0U;
	tmp[14] = 0x0U;
	tmp[15] = K[88];

	SHA256(ostate,tmp, 1);

	/* Compare the byte-swapped last hash word against the target. */
    bool result = (EndianSwapa((ostate[7])) <= target);
    if (result)
        SETFOUND(gid);

}

