/*-
 * Copyright 2009 Colin Percival, 2011 ArtForz, 2011 pooler, 2012 mtrlt,
 * 2012-2013 Con Kolivas.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * This file was originally written by Colin Percival as part of the Tarsnap
 * online backup system.
 *
 * V1.3 modified by sterling pickens linuxsociety.org 2014
 */

/* N (nfactor), CPU/Memory cost parameter */
__constant uint N[] = {
	0x00000001U,  /* never used, padding */
	0x00000002U,
	0x00000004U,
	0x00000008U,
	0x00000010U,
	0x00000020U,
	0x00000040U,
	0x00000080U,
	0x00000100U,
	0x00000200U,
	0x00000400U,  /* 2^10 == 1024, Litecoin scrypt default */
	0x00000800U,
	0x00001000U,
	0x00002000U,
	0x00004000U,
	0x00008000U,
	0x00010000U,
	0x00020000U,
	0x00040000U,
	0x00080000U,
	0x00100000U
};

/* Backwards compatibility, if NFACTOR not defined, default to 10 for scrypt */
#ifndef NFACTOR
#define NFACTOR 10
#endif

__constant uint ES[2] = { 0x00FF00FF, 0xFF00FF00 };


__constant uint K[] = {
	0x428a2f98U,
	0x71374491U,
	0xb5c0fbcfU,
	0xe9b5dba5U,
	0x3956c25bU,
	0x59f111f1U,
	0x923f82a4U,
	0xab1c5ed5U,
	0xd807aa98U,
	0x12835b01U,
	0x243185beU, // 10
	0x550c7dc3U,
	0x72be5d74U,
	0x80deb1feU,
	0x9bdc06a7U,
	0xe49b69c1U,
	0xefbe4786U,
	0x0fc19dc6U,
	0x240ca1ccU,
	0x2de92c6fU,
	0x4a7484aaU, // 20
	0x5cb0a9dcU,
	0x76f988daU,
	0x983e5152U,
	0xa831c66dU,
	0xb00327c8U,
	0xbf597fc7U,
	0xc6e00bf3U,
	0xd5a79147U,
	0x06ca6351U,
	0x14292967U, // 30
	0x27b70a85U,
	0x2e1b2138U,
	0x4d2c6dfcU,
	0x53380d13U,
	0x650a7354U,
	0x766a0abbU,
	0x81c2c92eU,
	0x92722c85U,
	0xa2bfe8a1U,
	0xa81a664bU, // 40
	0xc24b8b70U,
	0xc76c51a3U,
	0xd192e819U,
	0xd6990624U,
	0xf40e3585U,
	0x106aa070U,
	0x19a4c116U,
	0x1e376c08U,
	0x2748774cU,
	0x34b0bcb5U, // 50
	0x391c0cb3U,
	0x4ed8aa4aU,
	0x5b9cca4fU,
	0x682e6ff3U,
	0x748f82eeU,
	0x78a5636fU,
	0x84c87814U,
	0x8cc70208U,
	0x90befffaU,
	0xa4506cebU, // 60
	0xbef9a3f7U,
	0xc67178f2U,
	0x98c7e2a2U,
	0xfc08884dU,
	0xcd2a11aeU,
	0x510e527fU,
	0x9b05688cU,
	0xC3910C8EU,
	0xfb6feee7U,
	0x2a01a605U, // 70
	0x0c2e12e0U,
	0x4498517BU,
	0x6a09e667U,
	0xa4ce148bU,
	0x95F61999U,
	0xc19bf174U,
	0xBB67AE85U,
	0x3C6EF372U,
	0xA54FF53AU,
	0x1F83D9ABU, // 80
	0x5BE0CD19U
};


/*
	0x5C5C5C5CU,
	0x36363636U,
	0x80000000U,
//	0x000003FFU, //never used
	0x00000280U,
	0x000004a0U,
	0x00000300U
};
*/

#define FOURdeclare(myvar, v1, v2, v3, v4) uint myvar ## 1 = v1; \
                                           uint myvar ## 2 = v2; \
                                           uint myvar ## 3 = v3; \
                                           uint myvar ## 4 = v4;

#define FOURassign(myvar, v1, v2, v3, v4) myvar ## 1 = v1; \
                                          myvar ## 2 = v2; \
                                          myvar ## 3 = v3; \
                                          myvar ## 4 = v4;

#define FOURcopy(var1, var2) var1 ## 1 = var2 ## 1; \
                             var1 ## 2 = var2 ## 2; \
                             var1 ## 3 = var2 ## 3; \
                             var1 ## 4 = var2 ## 4;

#define FOURtovec(var1, var2) var1 .x = var2 ## 1; \
                              var1 .y = var2 ## 2; \
                              var1 .z = var2 ## 3; \
                              var1 .w = var2 ## 4;

#define FOURfromvec(var1, var2) var1 ## 1 = var2 .x; \
                                var1 ## 2 = var2 .y; \
                                var1 ## 3 = var2 .z; \
                                var1 ## 4 = var2 .w;

#define UNROLL_FACTOR 2


#define rotl(x,y) rotate(x,y)
#define Ch(x,y,z) bitselect(z,y,x)
#define Maj(x,y,z) Ch((x^z),y,z)

#define EndianSwapa(n) (Ch(E0, rotl(n, 8U), rotl(n, 24U)))
#define EndianSwapb(n) (rotl(n & E0, 24U)|rotl(n & E1, 8U))

#define Tr2(x)		(rotl(x, 30U) ^ rotl(x, 19U) ^ rotl(x, 10U))
#define Tr1(x)		(rotl(x, 26U) ^ rotl(x, 21U) ^ rotl(x, 7U))
#define Wr2(x)		(rotl(x, 25U) ^ rotl(x, 14U) ^ (x>>3U))
#define Wr1(x)		(rotl(x, 15U) ^ rotl(x, 13U) ^ (x>>10U))

#define RND(a, b, c, d, e, f, g, h, k)  \
	h += Tr1(e); 			\
	h += Ch(e, f, g); 		\
	h += k;				\
	d += h;				\
	h += Tr2(a); 			\
	h += Maj(a, b, c);

void SHA256(uint4 *restrict state0,uint4 *restrict state1, const uint4 block0, const uint4 block1, const uint4 block2, const uint4 block3, bool notfresh){

	
#define A S0a
#define B S0b
#define C S0c
#define D S0d
#define E S1a
#define F S1b
#define G S1c
#define H S1d

#define Wx a1
#define Wy a2
#define Wz a3
#define Ww a4

#define Xx b1
#define Xy b2
#define Xz b3 
#define Xw b4

#define Yx c1
#define Yy c2
#define Yz c3 
#define Yw c4

#define Zx d1
#define Zy d2
#define Zz d3 
#define Zw d4

	uint4 tmp0 = *state0;
	uint4 tmp1 = *state1;

	uint a1 = block0.x;
	uint b1 = block1.x;
	uint c1 = block2.x;
	uint d1 = block3.x;
	uint S0a = tmp0.x;
	uint S1a = tmp1.x;

	uint a2 = block0.y;
	uint b2 = block1.y;
	uint c2 = block2.y;
	uint d2 = block3.y;
	uint S0b = tmp0.y;
	uint S1b = tmp1.y;

	uint a3 = block0.z;
	uint b3 = block1.z;
	uint c3 = block2.z;
	uint d3 = block3.z;
	uint S0c = tmp0.z;
	uint S1c = tmp1.z;

	uint a4 = block0.w;
	uint b4 = block1.w;
	uint c4 = block2.w;
	uint d4 = block3.w;
	uint S0d = tmp0.w;
	uint S1d = tmp1.w;

	uint K0, K1, K2, K3, K4, K5, K6, K7;

	if(notfresh){
		K0 = K[0];
		K1 = K[1];
		K2 = K[2];
		K3 = K[3];
		RND(A,B,C,D,E,F,G,H, K0+Wx);
		RND(H,A,B,C,D,E,F,G, K1+Wy);
		RND(G,H,A,B,C,D,E,F, K2+Wz);
		RND(F,G,H,A,B,C,D,E, K3+Ww);
	}else{
		K0 = K[63];
		K1 = K[64];
		K2 = K[65];
		K3 = K[66];
		K4 = K[67];
		K5 = K[68];
		K6 = K[69];
		K7 = K[70];

		D= K0 +Wx;
		H= K1 +Wx;
		C= K2 +Tr1(D)+Ch(D, K3, K4)+Wy;
		G= K5 +C+Tr2(H)+Ch(H, K6 ,K7);

		tmp1.x = K3;
		tmp1.y = K4;

		K4 = K[71];
		K5 = K[72];
		K6 = K[73];
		K7 = K[74];
		K0 = K[75];

		B= K4 +Tr1(C)+Ch(C,D,K3)+Wz;
		F= K5 +B+Tr2(G)+Maj(G,H, K6);
		A= K7 +Tr1(B)+Ch(B,C,D)+Ww;
		E= K0 +A+Tr2(F)+Maj(F,G,H);

		tmp0.x = K6;
		tmp0.y = K[77];
		tmp0.z = K[78];
		tmp0.w = K[79];
		tmp1.z = K[80];
		tmp1.w = K[81];
	}

	K0 = K[4];
	K1 = K[5];
	K2 = K[6];
	K3 = K[7];
	RND(E,F,G,H,A,B,C,D,K0+Xx);
	RND(D,E,F,G,H,A,B,C,K1+Xy);
	RND(C,D,E,F,G,H,A,B,K2+Xz);
	RND(B,C,D,E,F,G,H,A,K3+Xw);
	K4 = K[8];
	K5 = K[9];
	K6 = K[10];
	K7 = K[11];
	RND(A,B,C,D,E,F,G,H,K4+Yx);
	RND(H,A,B,C,D,E,F,G,K5+Yy);
	RND(G,H,A,B,C,D,E,F,K6+Yz);
	RND(F,G,H,A,B,C,D,E,K7+Yw);
	K7 = K[76];
	K0 = K[12];
	K1 = K[13];
	K2 = K[14];
	RND(E,F,G,H,A,B,C,D,K0+Zx);
	RND(D,E,F,G,H,A,B,C,K1+Zy);
	RND(C,D,E,F,G,H,A,B,K2+Zz);
	RND(B,C,D,E,F,G,H,A,K7+Zw);
	K3 = K[15];
	K4 = K[16];
	K5 = K[17];
	K6 = K[18];
	Wx += Wr1(Zz) + Yy + Wr2(Wy);
	RND(A,B,C,D,E,F,G,H, Wx+ K3);
	Wy += Wr1(Zw) + Yz + Wr2(Wz);
	RND(H,A,B,C,D,E,F,G, Wy+ K4);
	Wz += Wr1(Wx) + Yw + Wr2(Ww);
	RND(G,H,A,B,C,D,E,F, Wz+ K5);
	Ww += Wr1(Wy) + Zx + Wr2(Xx);
	RND(F,G,H,A,B,C,D,E, Ww+ K6);
	K0 = K[19];
	K1 = K[20];
	K2 = K[21];
	K3 = K[22];
	Xx += Wr1(Wz) + Zy + Wr2(Xy);
	RND(E,F,G,H,A,B,C,D, Xx+ K0);
	Xy += Wr1(Ww) + Zz + Wr2(Xz);
	RND(D,E,F,G,H,A,B,C, Xy+ K1);
	Xz += Wr1(Xx) + Zw + Wr2(Xw);
	RND(C,D,E,F,G,H,A,B, Xz+ K2);
	Xw += Wr1(Xy) + Wx + Wr2(Yx);
	RND(B,C,D,E,F,G,H,A, Xw+ K3);
	K4 = K[23];
	K5 = K[24];
	K6 = K[25];
	K7 = K[26];
	Yx += Wr1(Xz) + Wy + Wr2(Yy);
	RND(A,B,C,D,E,F,G,H, Yx+ K4);
	Yy += Wr1(Xw) + Wz + Wr2(Yz);
	RND(H,A,B,C,D,E,F,G, Yy+ K5);
	Yz += Wr1(Yx) + Ww + Wr2(Yw);
	RND(G,H,A,B,C,D,E,F, Yz+ K6);
	Yw += Wr1(Yy) + Xx + Wr2(Zx);
	RND(F,G,H,A,B,C,D,E, Yw+ K7);
	K0 = K[27];
	K1 = K[28];
	K2 = K[29];
	K3 = K[30];
	Zx += Wr1(Yz) + Xy + Wr2(Zy);
	RND(E,F,G,H,A,B,C,D, Zx+ K0);
	Zy += Wr1(Yw) + Xz + Wr2(Zz);
	RND(D,E,F,G,H,A,B,C, Zy+ K1);
	Zz += Wr1(Zx) + Xw + Wr2(Zw);
	RND(C,D,E,F,G,H,A,B, Zz+ K2);
	Zw += Wr1(Zy) + Yx + Wr2(Wx);
	RND(B,C,D,E,F,G,H,A, Zw+ K3);
	K4 = K[31];
	K5 = K[32];
	K6 = K[33];
	K7 = K[34];
	Wx += Wr1(Zz) + Yy + Wr2(Wy);
	RND(A,B,C,D,E,F,G,H, Wx+ K4);
	Wy += Wr1(Zw) + Yz + Wr2(Wz);
	RND(H,A,B,C,D,E,F,G, Wy+ K5);
	Wz += Wr1(Wx) + Yw + Wr2(Ww);
	RND(G,H,A,B,C,D,E,F, Wz+ K6);
	Ww += Wr1(Wy) + Zx + Wr2(Xx);
	RND(F,G,H,A,B,C,D,E, Ww+ K7);
	K0 = K[35];
	K1 = K[36];
	K2 = K[37];
	K3 = K[38];
	Xx += Wr1(Wz) + Zy + Wr2(Xy);
	RND(E,F,G,H,A,B,C,D, Xx+ K0);
	Xy += Wr1(Ww) + Zz + Wr2(Xz);
	RND(D,E,F,G,H,A,B,C, Xy+ K1);
	Xz += Wr1(Xx) + Zw + Wr2(Xw);
	RND(C,D,E,F,G,H,A,B, Xz+ K2);
	Xw += Wr1(Xy) + Wx + Wr2(Yx);
	RND(B,C,D,E,F,G,H,A, Xw+ K3);
	K4 = K[39];
	K5 = K[40];
	K6 = K[41];
	K7 = K[42];
	Yx += Wr1(Xz) + Wy + Wr2(Yy);
	RND(A,B,C,D,E,F,G,H, Yx+ K4);
	Yy += Wr1(Xw) + Wz + Wr2(Yz);
	RND(H,A,B,C,D,E,F,G, Yy+ K5);
	Yz += Wr1(Yx) + Ww + Wr2(Yw);
	RND(G,H,A,B,C,D,E,F, Yz+ K6);
	Yw += Wr1(Yy) + Xx + Wr2(Zx);
	RND(F,G,H,A,B,C,D,E, Yw+ K7);
	K0 = K[43];
	K1 = K[44];
	K2 = K[45];
	K3 = K[46];
	Zx += Wr1(Yz) + Xy + Wr2(Zy);
	RND(E,F,G,H,A,B,C,D, Zx+ K0);
	Zy += Wr1(Yw) + Xz + Wr2(Zz);
	RND(D,E,F,G,H,A,B,C, Zy+ K1);
	Zz += Wr1(Zx) + Xw + Wr2(Zw);
	RND(C,D,E,F,G,H,A,B, Zz+ K2);
	Zw += Wr1(Zy) + Yx + Wr2(Wx);
	RND(B,C,D,E,F,G,H,A, Zw+ K3);
	K4 = K[47];
	K5 = K[48];
	K6 = K[49];
	K7 = K[50];
	Wx += Wr1(Zz) + Yy + Wr2(Wy);
	RND(A,B,C,D,E,F,G,H, Wx+ K4);
	Wy += Wr1(Zw) + Yz + Wr2(Wz);
	RND(H,A,B,C,D,E,F,G, Wy+ K5);
	Wz += Wr1(Wx) + Yw + Wr2(Ww);
	RND(G,H,A,B,C,D,E,F, Wz+ K6);
	Ww += Wr1(Wy) + Zx + Wr2(Xx);
	RND(F,G,H,A,B,C,D,E, Ww+ K7);
	K0 = K[51];
	K1 = K[52];
	K2 = K[53];
	K3 = K[54];
	Xx += Wr1(Wz) + Zy + Wr2(Xy);
	RND(E,F,G,H,A,B,C,D, Xx+ K0);
	Xy += Wr1(Ww) + Zz + Wr2(Xz);
	RND(D,E,F,G,H,A,B,C, Xy+ K1);
	Xz += Wr1(Xx) + Zw + Wr2(Xw);
	RND(C,D,E,F,G,H,A,B, Xz+ K2);
	Xw += Wr1(Xy) + Wx + Wr2(Yx);
	RND(B,C,D,E,F,G,H,A, Xw+ K3);
	K4 = K[55];
	K5 = K[56];
	K6 = K[57];
	K7 = K[58];
	Yx += Wr1(Xz) + Wy + Wr2(Yy);
	RND(A,B,C,D,E,F,G,H, Yx+ K4);
	Yy += Wr1(Xw) + Wz + Wr2(Yz);
	RND(H,A,B,C,D,E,F,G, Yy+ K5);
	Yz += Wr1(Yx) + Ww + Wr2(Yw);
	RND(G,H,A,B,C,D,E,F, Yz+ K6);
	Yw += Wr1(Yy) + Xx + Wr2(Zx);
	RND(F,G,H,A,B,C,D,E, Yw+ K7);
	K4 = K[59];
	K5 = K[60];
	K6 = K[61];
	K7 = K[62];
	Zx += Wr1(Yz) + Xy + Wr2(Zy);
	RND(E,F,G,H,A,B,C,D, Zx+ K4);
	Zy += Wr1(Yw) + Xz + Wr2(Zz);
	RND(D,E,F,G,H,A,B,C, Zy+ K5);
	Zz += Wr1(Zx) + Xw + Wr2(Zw);
	RND(C,D,E,F,G,H,A,B, Zz+ K6);
	Zw += Wr1(Zy) + Yx + Wr2(Wx);
	RND(B,C,D,E,F,G,H,A, Zw+ K7);
	
#undef A
#undef B
#undef C
#undef D
#undef E
#undef F
#undef G
#undef H

#undef Wx
#undef Wy
#undef Wz
#undef Ww

#undef Xx
#undef Xy
#undef Xz
#undef Xw

#undef Yx
#undef Yy
#undef Yz
#undef Yw

#undef Zx
#undef Zy
#undef Zz
#undef Zw

	tmp0 += (uint4)(S0a, S0b, S0c, S0d);
	tmp1 += (uint4)(S1a, S1b, S1c, S1d);

	*state0 = tmp0;
	*state1 = tmp1;
}

void halfsalsa(uint4 *w){
	for(uint i=0; i<4; ++i){
		w[0] ^= rotl(w[3]     +w[2]     , 7U);
		w[1] ^= rotl(w[0]     +w[3]     , 9U);
		w[2] ^= rotl(w[1]     +w[0]     ,13U);
		w[3] ^= rotl(w[2]     +w[1]     ,18U);
		w[2] ^= rotl(w[3].wxyz+w[0].zwxy, 7U);
		w[1] ^= rotl(w[2].wxyz+w[3].zwxy, 9U);
		w[0] ^= rotl(w[1].wxyz+w[2].zwxy,13U);
		w[3] ^= rotl(w[0].wxyz+w[1].zwxy,18U);
	}
}

#if (LOOKUP_GAP == 2)
void salsa(uint4 *B, bool db){
#else
void salsa(uint4 *B){
#endif
    uint4 w[4];

	for(uint i=0; i<4; ++i)
		w[i] = (B[i]^=B[i+4]);
	halfsalsa(w);
	for(uint i=0; i<4; ++i)
		w[i] = (B[i+4]^=(B[i]+=w[i]));
	halfsalsa(w);
#if (LOOKUP_GAP == 2)
	if(db){
		for(uint i=0; i<4; ++i)
			w[i] = (B[i]^=(B[i+4]+=w[i]));
		halfsalsa(w);
		for(uint i=0; i<4; ++i)
			w[i] = (B[i+4]^=(B[i]+=w[i]));
		halfsalsa(w);
	}
#endif
	for(uint i=0; i<4; ++i)
		B[i+4] += w[i];
}

#define Coord(x,y,z) x+y*(x ## SIZE)+z*(y ## SIZE)*(x ## SIZE)
#define CO Coord(z,x,y)

void scrypt_core(uint4 *X, __global uint4 *restrict lookup){
	const uint zSIZE = 8;
	const uint ySIZE = (N[NFACTOR]/LOOKUP_GAP+(N[NFACTOR]%LOOKUP_GAP>0));
	const uint xSIZE = CONCURRENT_THREADS;
	uint x = get_global_id(0)%xSIZE;
	uint4 tmp[4];
	uint E0 = ES[0];
	uint E1 = ES[1];

	tmp[0] = (uint4)(X[1].x,X[2].y,X[3].z,X[0].w);
	tmp[1] = (uint4)(X[2].x,X[3].y,X[0].z,X[1].w);
	tmp[2] = (uint4)(X[3].x,X[0].y,X[1].z,X[2].w);
	tmp[3] = (uint4)(X[0].x,X[1].y,X[2].z,X[3].w);

	X[0] = EndianSwapa(tmp[0]);
	X[1] = EndianSwapb(tmp[1]);
	X[2] = EndianSwapb(tmp[2]);
	X[3] = EndianSwapb(tmp[3]);

	tmp[0] = (uint4)(X[5].x,X[6].y,X[7].z,X[4].w);
	tmp[1] = (uint4)(X[6].x,X[7].y,X[4].z,X[5].w);
	tmp[2] = (uint4)(X[7].x,X[4].y,X[5].z,X[6].w);
	tmp[3] = (uint4)(X[4].x,X[5].y,X[6].z,X[7].w);

	X[4] = EndianSwapa(tmp[0]);
	X[5] = EndianSwapb(tmp[1]);
	X[6] = EndianSwapb(tmp[2]);
	X[7] = EndianSwapb(tmp[3]);

	for(uint y=0; y<(N[NFACTOR]/LOOKUP_GAP); ++y){
		for(uint z=0; z<zSIZE; ++z)
			lookup[CO] = X[z];

#if (LOOKUP_GAP == 2)
		salsa(X, 1);
#elif (LOOKUP_GAP == 1)
		salsa(X);
#else
		for(uint i=0; i<LOOKUP_GAP; ++i)
			salsa(X);
#endif
	}
#if (LOOKUP_GAP != 1) && (LOOKUP_GAP != 2) && (LOOKUP_GAP != 4) && (LOOKUP_GAP != 8)
        uint y = (N[NFACTOR]/LOOKUP_GAP);
        for(uint z=0; z<zSIZE; ++z)
            lookup[CO] = X[z];
        for(uint i=0; i<N[NFACTOR]%LOOKUP_GAP; ++i)
            salsa(X);
#endif

#if (LOOKUP_GAP != 1)
    for (uint i=0; i<N[NFACTOR]; ++i){
        uint j = X[7].x & (N[NFACTOR]-1);
#else
	for (uint i=0; i<N[NFACTOR]; ++i){
		uint y = X[7].x & (N[NFACTOR]-1);
#endif

#if (LOOKUP_GAP == 1)
		
#elif (LOOKUP_GAP == 2)
		uint y = (j>>1);
#elif (LOOKUP_GAP == 4)
		uint y = (j>>2);
#elif (LOOKUP_GAP == 8)
		uint y = (j>>3);
#else
		uint y = (j/LOOKUP_GAP);
#endif

#if (LOOKUP_GAP != 2) && (LOOKUP_GAP != 1)
		uint4 V[8];
		for(uint z=0; z<zSIZE; ++z)
			V[z] = lookup[CO];
#endif

#if (LOOKUP_GAP == 1)
		for(uint z=0; z<zSIZE; ++z)
			X[z] ^= lookup[CO];
#elif (LOOKUP_GAP == 2)
		if(j&1){
			uint4 V[8];
			for(uint z=0; z<zSIZE; ++z)
				V[z] = lookup[CO];
			salsa(V, 0);
			for(uint z=0; z<zSIZE; ++z)
				X[z] ^= V[z];
		}else{
			for(uint z=0; z<zSIZE; ++z)
				X[z] ^= lookup[CO];
		}
#else
		uint val = j%LOOKUP_GAP;
		for (uint z=0; z<val; ++z)
			salsa(V);
#endif

#if (LOOKUP_GAP != 2) && (LOOKUP_GAP != 1)
		for(uint z=0; z<zSIZE; ++z)
			X[z] ^= V[z];
#endif

#if (LOOKUP_GAP == 2)
        salsa(X, 0);
#else
        salsa(X);
#endif
    }

	tmp[0] = (uint4)(X[3].x,X[2].y,X[1].z,X[0].w);
	tmp[1] = (uint4)(X[0].x,X[3].y,X[2].z,X[1].w);
	tmp[2] = (uint4)(X[1].x,X[0].y,X[3].z,X[2].w);
	tmp[3] = (uint4)(X[2].x,X[1].y,X[0].z,X[3].w);

	X[0] = EndianSwapa(tmp[0]);
	X[1] = EndianSwapb(tmp[1]);
	X[2] = EndianSwapb(tmp[2]);
	X[3] = EndianSwapb(tmp[3]);

	tmp[0] = (uint4)(X[7].x,X[6].y,X[5].z,X[4].w);
	tmp[1] = (uint4)(X[4].x,X[7].y,X[6].z,X[5].w);
	tmp[2] = (uint4)(X[5].x,X[4].y,X[7].z,X[6].w);
	tmp[3] = (uint4)(X[6].x,X[5].y,X[4].z,X[7].w);

	X[4] = EndianSwapa(tmp[0]);
	X[5] = EndianSwapb(tmp[1]);
	X[6] = EndianSwapb(tmp[2]);
	X[7] = EndianSwapb(tmp[3]);
}

__constant uint fixedWa[8] = {0x428a2f99,0xd807aa98,0xf59b89c2,0xb707775c,0xad87a3ea,0xc91b1417,0xe64fb6a2,0xe0a1adbe};
__constant uint fixedWb[8] = {0xf1374491,0x12835b01,0x73924787,0x0468c23f,0xbcb1d3a3,0xc359dce1,0xe84d923a,0x7c728e11};
__constant uint fixedWc[8] = {0xb5c0fbcf,0x243185be,0x23c6886e,0xe7e72b4c,0x7b993186,0xa83253a7,0xe93a5730,0x511c78e4};
__constant uint fixedWd[8] = {0xe9b5dba5,0x550c7dc3,0xa42ca65c,0x49e1f1a2,0x562b9420,0x3b13c12d,0x09837686,0x315b45bd};
__constant uint fixedWe[8] = {0x3956c25b,0x72be5d74,0x15ed3627,0x4b99c816,0xbff3ca0c,0x9d3d725d,0x078ff753,0xfca71413};
__constant uint fixedWf[8] = {0x59f111f1,0x80deb1fe,0x4d6edcbf,0x926d1570,0xda4b0c23,0xd9031a84,0x29833341,0xea28f96a};
__constant uint fixedWg[8] = {0x923f82a4,0x9bdc06a7,0xe28217fc,0xaa0fc072,0x6cd8711a,0xb1a03340,0xd5de0b7e,0x79703128};
__constant uint fixedWh[8] = {0xab1c5ed5,0xc19bf794,0xef02488f,0xadb36e2c,0x8f337caa,0x16f58012,0x6948ccf4,0x4e1ef848};
__constant uint sK[6] = {
    0x5C5C5C5CU, //82
    0x36363636U,
    0x80000000U,
//  0x000003FFU, //never used
    0x00000280U,
    0x000004a0U,
    0x00000300U
};

#define FOUND (0xFF)
#define SETFOUND(Xnonce) output[output[FOUND]++] = Xnonce

__attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
__kernel void search(__global const uint4 *restrict input,
volatile __global uint *restrict output, __global uint4 *restrict padcache, const uint4 midstate0, const uint4 midstate16, const uint target){

	uint gid = get_global_id(0);
	uint4 X[8];
	uint4 tmp0, tmp1;
	//uint4 tstate0, tstate1, ostate0, ostate1, tmp0, tmp1;
	//uint4 data = (uint4)(input[4].x,input[4].y,input[4].z,gid);
	//uint4 pad0 = midstate0, pad1 = midstate16;

	uint4 pass0, pass1, pass2;

	FOURdeclare(tstate0, 0, 0, 0, 0);
	FOURdeclare(tstate1, 0, 0, 0, 0);
	FOURdeclare(ostate0, 0, 0, 0, 0);
	FOURdeclare(ostate1, 0, 0, 0, 0);
	FOURdeclare(data, input[4].x,input[4].y,input[4].z,gid);

	//FOURdeclare(data, input[12], input[13], input[14], gid);
	//FOURdeclare(pad0, midstate0[0], midstate0[1], midstate0[2], midstate0[3]);
	//FOURdeclare(pad1, midstate16[0], midstate16[1], midstate16[2], midstate16[3]);

	FOURdeclare(pad0, midstate0.x, midstate0.y, midstate0.z, midstate0.w);
	FOURdeclare(pad1, midstate16.x, midstate16.y, midstate16.z, midstate16.w);

	FOURtovec(pass0, pad0);
	FOURtovec(pass1, pad1);
	FOURtovec(pass2, data);

	SHA256(&pass0,&pass1, pass2, (uint4)(sK[2],0,0,0), (uint4)(0,0,0,0), (uint4)(0,0,0, sK[3]), 1);
	FOURfromvec(pad0, pass0);
	FOURfromvec(pad1, pass1);
	//FOURfromvec(data, pass2);

    //FOURtovec(pass0, ostate0);
    //FOURtovec(pass1, ostate1);
	tmp0 = pass0^sK[0];
	tmp1 = pass1^sK[0];
	FOURtovec(pass0, ostate0);
	FOURtovec(pass1, ostate1);
	SHA256(&pass0, &pass1, tmp0, tmp1, sK[0], sK[0], 0);
    FOURfromvec(ostate0, pass0);
    FOURfromvec(ostate1, pass1);

	FOURtovec(pass0, pad0);
	FOURtovec(pass1, pad1);
	tmp0 = pass0^sK[1];
	tmp1 = pass1^sK[1];
	FOURtovec(pass0, tstate0);
	FOURtovec(pass1, tstate1);
	SHA256(&pass0, &pass1, tmp0, tmp1, sK[1], sK[1], 0);
	FOURfromvec(tstate0, pass0);
	FOURfromvec(tstate1, pass1);

	tmp0 = pass0;
	tmp1 = pass1;

	//FOURtovec(pass0, tstate0);
	//FOURtovec(pass1, tstate1);
	SHA256(&pass0, &pass1, input[0],input[1],input[2],input[3], 1);
	FOURfromvec(tstate0, pass0);
	FOURfromvec(tstate1, pass1);

/*

    SHA256(&pad0,&pad1, data, (uint4)(sK[2],0,0,0), (uint4)(0,0,0,0), (uint4)(0,0,0, sK[3]), 1);
    SHA256(&ostate0,&ostate1, pad0^sK[0], pad1^sK[0], sK[0], sK[0], 0);
    SHA256(&tstate0,&tstate1, pad0^sK[1], pad1^sK[1], sK[1], sK[1], 0);

    tmp0 = tstate0;
    tmp1 = tstate1;
    SHA256(&tstate0, &tstate1, input[0],input[1],input[2],input[3], 1);


	for (uint i=0; i<4; i++){
		pad0 = tstate0;
		pad1 = tstate1;
		X[(i<<1) ] = ostate0;
		X[(i<<1)+1] = ostate1;
		SHA256(&pad0, &pad1, data, (uint4)(i+1,sK[2],0,0), (uint4)(0,0,0,0), (uint4)(0,0,0, sK[4]), 1);
		SHA256(X+(i<<1),X+(i<<1)+1, pad0, pad1, (uint4)(sK[2], 0U, 0U, 0U), (uint4)(0U, 0U, 0U, sK[5]), 1);
	}
*/

	for (uint i=0; i<4; i++){
		FOURcopy(pad0, tstate0);
		FOURcopy(pad1, tstate1);

		//FOURtovec( X[(i<<1) ], ostate0);
		//FOURtovec( X[(i<<1)+1], ostate1);
		FOURtovec(pass0, ostate0);
		FOURtovec(pass1, ostate1);
		X[(i<<1) ] = pass0;
		X[(i<<1)+1] = pass1;

		FOURtovec(pass0, pad0);
		FOURtovec(pass1, pad1);
		FOURtovec(pass2, data);
		SHA256(&pass0, &pass1, pass2, (uint4)(i+1,sK[2],0,0), (uint4)(0,0,0,0), (uint4)(0,0,0, sK[4]), 1);
		FOURfromvec(pad0, pass0);
		FOURfromvec(pad1, pass1);
		SHA256(X+(i<<1),X+(i<<1)+1, pass0, pass1, (uint4)(sK[2], 0U, 0U, 0U), (uint4)(0U, 0U, 0U, sK[5]), 1);
	}


	scrypt_core(X,padcache);

	SHA256(&tmp0,&tmp1, X[0], X[1], X[2], X[3], 1);
	SHA256(&tmp0,&tmp1, X[4], X[5], X[6], X[7], 1);

	//tstate0 = tmp0;
	//tstate1 = tmp1;
	FOURfromvec(tstate0, tmp0);
	FOURfromvec(tstate1, tmp1);

#define A tstate01
#define B tstate02
#define C tstate03
#define D tstate04
#define E tstate11
#define F tstate12
#define G tstate13
#define H tstate14

#pragma unroll UNROLL_FACTOR
	for(uint i=0; i<8; i++){
		RND(A,B,C,D,E,F,G,H, fixedWa[i]);
		RND(H,A,B,C,D,E,F,G, fixedWb[i]);
		RND(G,H,A,B,C,D,E,F, fixedWc[i]);
		RND(F,G,H,A,B,C,D,E, fixedWd[i]);
		RND(E,F,G,H,A,B,C,D, fixedWe[i]);
		RND(D,E,F,G,H,A,B,C, fixedWf[i]);
		RND(C,D,E,F,G,H,A,B, fixedWg[i]);
		RND(B,C,D,E,F,G,H,A, fixedWh[i]);
	}

#undef A
#undef B
#undef C
#undef D
#undef E
#undef F
#undef G
#undef H

	FOURtovec(pass0, tstate0);
	FOURtovec(pass1, tstate1);

	tmp0 += pass0;
	tmp1 += pass1;

	FOURtovec(pass2, ostate0);
	FOURtovec(pass0, ostate1);
	SHA256(&pass2,&pass0, tmp0, tmp1, (uint4)(sK[2], 0U, 0U, 0U), (uint4)(0U, 0U, 0U, sK[5]), 1);

	//FOURfromvec(ostate0, pass2);
	FOURfromvec(ostate1, pass0);
    uint E0 = ES[0];
	bool result = (EndianSwapa(ostate14) <= target);
	if (result)
		SETFOUND(gid);
}
