/*-
 * Copyright 2009 Colin Percival, 2011 ArtForz, 2011 pooler, 2012 mtrlt,
 * 2012-2013 Con Kolivas.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * This file was originally written by Colin Percival as part of the Tarsnap
 * online backup system.
 *
 * V1.3 modified by sterling pickens linuxsociety.org 2014
 */

#define COORDINDEX(x,y,z) (x+idx[y]+z)
#define IDX COORDINDEX(x,y,z)

__constant uint idx[512] = {
		0*CONCURRENT_THREADS*8, 1*CONCURRENT_THREADS*8, 2*CONCURRENT_THREADS*8, 3*CONCURRENT_THREADS*8, 4*CONCURRENT_THREADS*8, 5*CONCURRENT_THREADS*8, 6*CONCURRENT_THREADS*8, 7*CONCURRENT_THREADS*8, 8*CONCURRENT_THREADS*8, 9*CONCURRENT_THREADS*8, 10*CONCURRENT_THREADS*8, 11*CONCURRENT_THREADS*8, 12*CONCURRENT_THREADS*8, 13*CONCURRENT_THREADS*8, 14*CONCURRENT_THREADS*8, 15*CONCURRENT_THREADS*8, 16*CONCURRENT_THREADS*8, 17*CONCURRENT_THREADS*8, 18*CONCURRENT_THREADS*8, 19*CONCURRENT_THREADS*8, 20*CONCURRENT_THREADS*8, 21*CONCURRENT_THREADS*8, 22*CONCURRENT_THREADS*8, 23*CONCURRENT_THREADS*8, 24*CONCURRENT_THREADS*8, 25*CONCURRENT_THREADS*8, 26*CONCURRENT_THREADS*8, 27*CONCURRENT_THREADS*8, 28*CONCURRENT_THREADS*8, 29*CONCURRENT_THREADS*8, 30*CONCURRENT_THREADS*8, 
		31*CONCURRENT_THREADS*8, 32*CONCURRENT_THREADS*8, 33*CONCURRENT_THREADS*8, 34*CONCURRENT_THREADS*8, 35*CONCURRENT_THREADS*8, 36*CONCURRENT_THREADS*8, 37*CONCURRENT_THREADS*8, 38*CONCURRENT_THREADS*8, 39*CONCURRENT_THREADS*8, 40*CONCURRENT_THREADS*8, 41*CONCURRENT_THREADS*8, 42*CONCURRENT_THREADS*8, 43*CONCURRENT_THREADS*8, 44*CONCURRENT_THREADS*8, 45*CONCURRENT_THREADS*8, 46*CONCURRENT_THREADS*8, 47*CONCURRENT_THREADS*8, 48*CONCURRENT_THREADS*8, 49*CONCURRENT_THREADS*8, 50*CONCURRENT_THREADS*8, 51*CONCURRENT_THREADS*8, 52*CONCURRENT_THREADS*8, 53*CONCURRENT_THREADS*8, 54*CONCURRENT_THREADS*8, 55*CONCURRENT_THREADS*8, 56*CONCURRENT_THREADS*8, 57*CONCURRENT_THREADS*8, 58*CONCURRENT_THREADS*8, 59*CONCURRENT_THREADS*8, 60*CONCURRENT_THREADS*8, 61*CONCURRENT_THREADS*8, 62*CONCURRENT_THREADS*8, 
		63*CONCURRENT_THREADS*8, 64*CONCURRENT_THREADS*8, 65*CONCURRENT_THREADS*8, 66*CONCURRENT_THREADS*8, 67*CONCURRENT_THREADS*8, 68*CONCURRENT_THREADS*8, 69*CONCURRENT_THREADS*8, 70*CONCURRENT_THREADS*8, 71*CONCURRENT_THREADS*8, 72*CONCURRENT_THREADS*8, 73*CONCURRENT_THREADS*8, 74*CONCURRENT_THREADS*8, 75*CONCURRENT_THREADS*8, 76*CONCURRENT_THREADS*8, 77*CONCURRENT_THREADS*8, 78*CONCURRENT_THREADS*8, 79*CONCURRENT_THREADS*8, 80*CONCURRENT_THREADS*8, 81*CONCURRENT_THREADS*8, 82*CONCURRENT_THREADS*8, 83*CONCURRENT_THREADS*8, 84*CONCURRENT_THREADS*8, 85*CONCURRENT_THREADS*8, 86*CONCURRENT_THREADS*8, 87*CONCURRENT_THREADS*8, 88*CONCURRENT_THREADS*8, 89*CONCURRENT_THREADS*8, 90*CONCURRENT_THREADS*8, 91*CONCURRENT_THREADS*8, 92*CONCURRENT_THREADS*8, 93*CONCURRENT_THREADS*8, 94*CONCURRENT_THREADS*8, 
		95*CONCURRENT_THREADS*8, 96*CONCURRENT_THREADS*8, 97*CONCURRENT_THREADS*8, 98*CONCURRENT_THREADS*8, 99*CONCURRENT_THREADS*8, 100*CONCURRENT_THREADS*8, 101*CONCURRENT_THREADS*8, 102*CONCURRENT_THREADS*8, 103*CONCURRENT_THREADS*8, 104*CONCURRENT_THREADS*8, 105*CONCURRENT_THREADS*8, 106*CONCURRENT_THREADS*8, 107*CONCURRENT_THREADS*8, 108*CONCURRENT_THREADS*8, 109*CONCURRENT_THREADS*8, 110*CONCURRENT_THREADS*8, 111*CONCURRENT_THREADS*8, 112*CONCURRENT_THREADS*8, 113*CONCURRENT_THREADS*8, 114*CONCURRENT_THREADS*8, 115*CONCURRENT_THREADS*8, 116*CONCURRENT_THREADS*8, 117*CONCURRENT_THREADS*8, 118*CONCURRENT_THREADS*8, 119*CONCURRENT_THREADS*8, 120*CONCURRENT_THREADS*8, 121*CONCURRENT_THREADS*8, 122*CONCURRENT_THREADS*8, 123*CONCURRENT_THREADS*8, 124*CONCURRENT_THREADS*8, 125*CONCURRENT_THREADS*8, 126*CONCURRENT_THREADS*8, 
		127*CONCURRENT_THREADS*8, 128*CONCURRENT_THREADS*8, 129*CONCURRENT_THREADS*8, 130*CONCURRENT_THREADS*8, 131*CONCURRENT_THREADS*8, 132*CONCURRENT_THREADS*8, 133*CONCURRENT_THREADS*8, 134*CONCURRENT_THREADS*8, 135*CONCURRENT_THREADS*8, 136*CONCURRENT_THREADS*8, 137*CONCURRENT_THREADS*8, 138*CONCURRENT_THREADS*8, 139*CONCURRENT_THREADS*8, 140*CONCURRENT_THREADS*8, 141*CONCURRENT_THREADS*8, 142*CONCURRENT_THREADS*8, 143*CONCURRENT_THREADS*8, 144*CONCURRENT_THREADS*8, 145*CONCURRENT_THREADS*8, 146*CONCURRENT_THREADS*8, 147*CONCURRENT_THREADS*8, 148*CONCURRENT_THREADS*8, 149*CONCURRENT_THREADS*8, 150*CONCURRENT_THREADS*8, 151*CONCURRENT_THREADS*8, 152*CONCURRENT_THREADS*8, 153*CONCURRENT_THREADS*8, 154*CONCURRENT_THREADS*8, 155*CONCURRENT_THREADS*8, 156*CONCURRENT_THREADS*8, 157*CONCURRENT_THREADS*8, 158*CONCURRENT_THREADS*8, 
		159*CONCURRENT_THREADS*8, 160*CONCURRENT_THREADS*8, 161*CONCURRENT_THREADS*8, 162*CONCURRENT_THREADS*8, 163*CONCURRENT_THREADS*8, 164*CONCURRENT_THREADS*8, 165*CONCURRENT_THREADS*8, 166*CONCURRENT_THREADS*8, 167*CONCURRENT_THREADS*8, 168*CONCURRENT_THREADS*8, 169*CONCURRENT_THREADS*8, 170*CONCURRENT_THREADS*8, 171*CONCURRENT_THREADS*8, 172*CONCURRENT_THREADS*8, 173*CONCURRENT_THREADS*8, 174*CONCURRENT_THREADS*8, 175*CONCURRENT_THREADS*8, 176*CONCURRENT_THREADS*8, 177*CONCURRENT_THREADS*8, 178*CONCURRENT_THREADS*8, 179*CONCURRENT_THREADS*8, 180*CONCURRENT_THREADS*8, 181*CONCURRENT_THREADS*8, 182*CONCURRENT_THREADS*8, 183*CONCURRENT_THREADS*8, 184*CONCURRENT_THREADS*8, 185*CONCURRENT_THREADS*8, 186*CONCURRENT_THREADS*8, 187*CONCURRENT_THREADS*8, 188*CONCURRENT_THREADS*8, 189*CONCURRENT_THREADS*8, 190*CONCURRENT_THREADS*8, 
		191*CONCURRENT_THREADS*8, 192*CONCURRENT_THREADS*8, 193*CONCURRENT_THREADS*8, 194*CONCURRENT_THREADS*8, 195*CONCURRENT_THREADS*8, 196*CONCURRENT_THREADS*8, 197*CONCURRENT_THREADS*8, 198*CONCURRENT_THREADS*8, 199*CONCURRENT_THREADS*8, 200*CONCURRENT_THREADS*8, 201*CONCURRENT_THREADS*8, 202*CONCURRENT_THREADS*8, 203*CONCURRENT_THREADS*8, 204*CONCURRENT_THREADS*8, 205*CONCURRENT_THREADS*8, 206*CONCURRENT_THREADS*8, 207*CONCURRENT_THREADS*8, 208*CONCURRENT_THREADS*8, 209*CONCURRENT_THREADS*8, 210*CONCURRENT_THREADS*8, 211*CONCURRENT_THREADS*8, 212*CONCURRENT_THREADS*8, 213*CONCURRENT_THREADS*8, 214*CONCURRENT_THREADS*8, 215*CONCURRENT_THREADS*8, 216*CONCURRENT_THREADS*8, 217*CONCURRENT_THREADS*8, 218*CONCURRENT_THREADS*8, 219*CONCURRENT_THREADS*8, 220*CONCURRENT_THREADS*8, 221*CONCURRENT_THREADS*8, 222*CONCURRENT_THREADS*8, 
		223*CONCURRENT_THREADS*8, 224*CONCURRENT_THREADS*8, 225*CONCURRENT_THREADS*8, 226*CONCURRENT_THREADS*8, 227*CONCURRENT_THREADS*8, 228*CONCURRENT_THREADS*8, 229*CONCURRENT_THREADS*8, 230*CONCURRENT_THREADS*8, 231*CONCURRENT_THREADS*8, 232*CONCURRENT_THREADS*8, 233*CONCURRENT_THREADS*8, 234*CONCURRENT_THREADS*8, 235*CONCURRENT_THREADS*8, 236*CONCURRENT_THREADS*8, 237*CONCURRENT_THREADS*8, 238*CONCURRENT_THREADS*8, 239*CONCURRENT_THREADS*8, 240*CONCURRENT_THREADS*8, 241*CONCURRENT_THREADS*8, 242*CONCURRENT_THREADS*8, 243*CONCURRENT_THREADS*8, 244*CONCURRENT_THREADS*8, 245*CONCURRENT_THREADS*8, 246*CONCURRENT_THREADS*8, 247*CONCURRENT_THREADS*8, 248*CONCURRENT_THREADS*8, 249*CONCURRENT_THREADS*8, 250*CONCURRENT_THREADS*8, 251*CONCURRENT_THREADS*8, 252*CONCURRENT_THREADS*8, 253*CONCURRENT_THREADS*8, 254*CONCURRENT_THREADS*8, 
		255*CONCURRENT_THREADS*8, 256*CONCURRENT_THREADS*8, 257*CONCURRENT_THREADS*8, 258*CONCURRENT_THREADS*8, 259*CONCURRENT_THREADS*8, 260*CONCURRENT_THREADS*8, 261*CONCURRENT_THREADS*8, 262*CONCURRENT_THREADS*8, 263*CONCURRENT_THREADS*8, 264*CONCURRENT_THREADS*8, 265*CONCURRENT_THREADS*8, 266*CONCURRENT_THREADS*8, 267*CONCURRENT_THREADS*8, 268*CONCURRENT_THREADS*8, 269*CONCURRENT_THREADS*8, 270*CONCURRENT_THREADS*8, 271*CONCURRENT_THREADS*8, 272*CONCURRENT_THREADS*8, 273*CONCURRENT_THREADS*8, 274*CONCURRENT_THREADS*8, 275*CONCURRENT_THREADS*8, 276*CONCURRENT_THREADS*8, 277*CONCURRENT_THREADS*8, 278*CONCURRENT_THREADS*8, 279*CONCURRENT_THREADS*8, 280*CONCURRENT_THREADS*8, 281*CONCURRENT_THREADS*8, 282*CONCURRENT_THREADS*8, 283*CONCURRENT_THREADS*8, 284*CONCURRENT_THREADS*8, 285*CONCURRENT_THREADS*8, 286*CONCURRENT_THREADS*8, 
		287*CONCURRENT_THREADS*8, 288*CONCURRENT_THREADS*8, 289*CONCURRENT_THREADS*8, 290*CONCURRENT_THREADS*8, 291*CONCURRENT_THREADS*8, 292*CONCURRENT_THREADS*8, 293*CONCURRENT_THREADS*8, 294*CONCURRENT_THREADS*8, 295*CONCURRENT_THREADS*8, 296*CONCURRENT_THREADS*8, 297*CONCURRENT_THREADS*8, 298*CONCURRENT_THREADS*8, 299*CONCURRENT_THREADS*8, 300*CONCURRENT_THREADS*8, 301*CONCURRENT_THREADS*8, 302*CONCURRENT_THREADS*8, 303*CONCURRENT_THREADS*8, 304*CONCURRENT_THREADS*8, 305*CONCURRENT_THREADS*8, 306*CONCURRENT_THREADS*8, 307*CONCURRENT_THREADS*8, 308*CONCURRENT_THREADS*8, 309*CONCURRENT_THREADS*8, 310*CONCURRENT_THREADS*8, 311*CONCURRENT_THREADS*8, 312*CONCURRENT_THREADS*8, 313*CONCURRENT_THREADS*8, 314*CONCURRENT_THREADS*8, 315*CONCURRENT_THREADS*8, 316*CONCURRENT_THREADS*8, 317*CONCURRENT_THREADS*8, 318*CONCURRENT_THREADS*8, 
		319*CONCURRENT_THREADS*8, 320*CONCURRENT_THREADS*8, 321*CONCURRENT_THREADS*8, 322*CONCURRENT_THREADS*8, 323*CONCURRENT_THREADS*8, 324*CONCURRENT_THREADS*8, 325*CONCURRENT_THREADS*8, 326*CONCURRENT_THREADS*8, 327*CONCURRENT_THREADS*8, 328*CONCURRENT_THREADS*8, 329*CONCURRENT_THREADS*8, 330*CONCURRENT_THREADS*8, 331*CONCURRENT_THREADS*8, 332*CONCURRENT_THREADS*8, 333*CONCURRENT_THREADS*8, 334*CONCURRENT_THREADS*8, 335*CONCURRENT_THREADS*8, 336*CONCURRENT_THREADS*8, 337*CONCURRENT_THREADS*8, 338*CONCURRENT_THREADS*8, 339*CONCURRENT_THREADS*8, 340*CONCURRENT_THREADS*8, 341*CONCURRENT_THREADS*8, 342*CONCURRENT_THREADS*8, 343*CONCURRENT_THREADS*8, 344*CONCURRENT_THREADS*8, 345*CONCURRENT_THREADS*8, 346*CONCURRENT_THREADS*8, 347*CONCURRENT_THREADS*8, 348*CONCURRENT_THREADS*8, 349*CONCURRENT_THREADS*8, 350*CONCURRENT_THREADS*8, 
		351*CONCURRENT_THREADS*8, 352*CONCURRENT_THREADS*8, 353*CONCURRENT_THREADS*8, 354*CONCURRENT_THREADS*8, 355*CONCURRENT_THREADS*8, 356*CONCURRENT_THREADS*8, 357*CONCURRENT_THREADS*8, 358*CONCURRENT_THREADS*8, 359*CONCURRENT_THREADS*8, 360*CONCURRENT_THREADS*8, 361*CONCURRENT_THREADS*8, 362*CONCURRENT_THREADS*8, 363*CONCURRENT_THREADS*8, 364*CONCURRENT_THREADS*8, 365*CONCURRENT_THREADS*8, 366*CONCURRENT_THREADS*8, 367*CONCURRENT_THREADS*8, 368*CONCURRENT_THREADS*8, 369*CONCURRENT_THREADS*8, 370*CONCURRENT_THREADS*8, 371*CONCURRENT_THREADS*8, 372*CONCURRENT_THREADS*8, 373*CONCURRENT_THREADS*8, 374*CONCURRENT_THREADS*8, 375*CONCURRENT_THREADS*8, 376*CONCURRENT_THREADS*8, 377*CONCURRENT_THREADS*8, 378*CONCURRENT_THREADS*8, 379*CONCURRENT_THREADS*8, 380*CONCURRENT_THREADS*8, 381*CONCURRENT_THREADS*8, 382*CONCURRENT_THREADS*8, 
		383*CONCURRENT_THREADS*8, 384*CONCURRENT_THREADS*8, 385*CONCURRENT_THREADS*8, 386*CONCURRENT_THREADS*8, 387*CONCURRENT_THREADS*8, 388*CONCURRENT_THREADS*8, 389*CONCURRENT_THREADS*8, 390*CONCURRENT_THREADS*8, 391*CONCURRENT_THREADS*8, 392*CONCURRENT_THREADS*8, 393*CONCURRENT_THREADS*8, 394*CONCURRENT_THREADS*8, 395*CONCURRENT_THREADS*8, 396*CONCURRENT_THREADS*8, 397*CONCURRENT_THREADS*8, 398*CONCURRENT_THREADS*8, 399*CONCURRENT_THREADS*8, 400*CONCURRENT_THREADS*8, 401*CONCURRENT_THREADS*8, 402*CONCURRENT_THREADS*8, 403*CONCURRENT_THREADS*8, 404*CONCURRENT_THREADS*8, 405*CONCURRENT_THREADS*8, 406*CONCURRENT_THREADS*8, 407*CONCURRENT_THREADS*8, 408*CONCURRENT_THREADS*8, 409*CONCURRENT_THREADS*8, 410*CONCURRENT_THREADS*8, 411*CONCURRENT_THREADS*8, 412*CONCURRENT_THREADS*8, 413*CONCURRENT_THREADS*8, 414*CONCURRENT_THREADS*8, 
		415*CONCURRENT_THREADS*8, 416*CONCURRENT_THREADS*8, 417*CONCURRENT_THREADS*8, 418*CONCURRENT_THREADS*8, 419*CONCURRENT_THREADS*8, 420*CONCURRENT_THREADS*8, 421*CONCURRENT_THREADS*8, 422*CONCURRENT_THREADS*8, 423*CONCURRENT_THREADS*8, 424*CONCURRENT_THREADS*8, 425*CONCURRENT_THREADS*8, 426*CONCURRENT_THREADS*8, 427*CONCURRENT_THREADS*8, 428*CONCURRENT_THREADS*8, 429*CONCURRENT_THREADS*8, 430*CONCURRENT_THREADS*8, 431*CONCURRENT_THREADS*8, 432*CONCURRENT_THREADS*8, 433*CONCURRENT_THREADS*8, 434*CONCURRENT_THREADS*8, 435*CONCURRENT_THREADS*8, 436*CONCURRENT_THREADS*8, 437*CONCURRENT_THREADS*8, 438*CONCURRENT_THREADS*8, 439*CONCURRENT_THREADS*8, 440*CONCURRENT_THREADS*8, 441*CONCURRENT_THREADS*8, 442*CONCURRENT_THREADS*8, 443*CONCURRENT_THREADS*8, 444*CONCURRENT_THREADS*8, 445*CONCURRENT_THREADS*8, 446*CONCURRENT_THREADS*8, 
		447*CONCURRENT_THREADS*8, 448*CONCURRENT_THREADS*8, 449*CONCURRENT_THREADS*8, 450*CONCURRENT_THREADS*8, 451*CONCURRENT_THREADS*8, 452*CONCURRENT_THREADS*8, 453*CONCURRENT_THREADS*8, 454*CONCURRENT_THREADS*8, 455*CONCURRENT_THREADS*8, 456*CONCURRENT_THREADS*8, 457*CONCURRENT_THREADS*8, 458*CONCURRENT_THREADS*8, 459*CONCURRENT_THREADS*8, 460*CONCURRENT_THREADS*8, 461*CONCURRENT_THREADS*8, 462*CONCURRENT_THREADS*8, 463*CONCURRENT_THREADS*8, 464*CONCURRENT_THREADS*8, 465*CONCURRENT_THREADS*8, 466*CONCURRENT_THREADS*8, 467*CONCURRENT_THREADS*8, 468*CONCURRENT_THREADS*8, 469*CONCURRENT_THREADS*8, 470*CONCURRENT_THREADS*8, 471*CONCURRENT_THREADS*8, 472*CONCURRENT_THREADS*8, 473*CONCURRENT_THREADS*8, 474*CONCURRENT_THREADS*8, 475*CONCURRENT_THREADS*8, 476*CONCURRENT_THREADS*8, 477*CONCURRENT_THREADS*8, 478*CONCURRENT_THREADS*8, 
		479*CONCURRENT_THREADS*8, 480*CONCURRENT_THREADS*8, 481*CONCURRENT_THREADS*8, 482*CONCURRENT_THREADS*8, 483*CONCURRENT_THREADS*8, 484*CONCURRENT_THREADS*8, 485*CONCURRENT_THREADS*8, 486*CONCURRENT_THREADS*8, 487*CONCURRENT_THREADS*8, 488*CONCURRENT_THREADS*8, 489*CONCURRENT_THREADS*8, 490*CONCURRENT_THREADS*8, 491*CONCURRENT_THREADS*8, 492*CONCURRENT_THREADS*8, 493*CONCURRENT_THREADS*8, 494*CONCURRENT_THREADS*8, 495*CONCURRENT_THREADS*8, 496*CONCURRENT_THREADS*8, 497*CONCURRENT_THREADS*8, 498*CONCURRENT_THREADS*8, 499*CONCURRENT_THREADS*8, 500*CONCURRENT_THREADS*8, 501*CONCURRENT_THREADS*8, 502*CONCURRENT_THREADS*8, 503*CONCURRENT_THREADS*8, 504*CONCURRENT_THREADS*8, 505*CONCURRENT_THREADS*8, 506*CONCURRENT_THREADS*8, 507*CONCURRENT_THREADS*8, 508*CONCURRENT_THREADS*8, 509*CONCURRENT_THREADS*8, 510*CONCURRENT_THREADS*8, 
		511*CONCURRENT_THREADS*8};


 

/* N (nfactor), CPU/Memory cost parameter */
__constant uint N[] = {
	0x00000001U,  /* never used, padding */
	0x00000002U,
	0x00000004U,
	0x00000008U,
	0x00000010U,
	0x00000020U,
	0x00000040U,
	0x00000080U,
	0x00000100U,
	0x00000200U,
	0x00000400U,  /* 2^10 == 1024, Litecoin scrypt default */
	0x00000800U,
	0x00001000U,
	0x00002000U,
	0x00004000U,
	0x00008000U,
	0x00010000U,
	0x00020000U,
	0x00040000U,
	0x00080000U,
	0x00100000U
};

/* Backwards compatibility, if NFACTOR not defined, default to 10 for scrypt */
#ifndef NFACTOR
#define NFACTOR 10
#endif

//__constant uint ES[2] = { 0x00FF00FF, 0xFF00FF00 };

#define E0 0x00FF00FFU
#define E1 0xFF00FF00U


__constant uint K[] = {
	0x428a2f98U,
	0x71374491U,
	0xb5c0fbcfU,
	0xe9b5dba5U,
	0x3956c25bU,
	0x59f111f1U,
	0x923f82a4U,
	0xab1c5ed5U,
	0xd807aa98U,
	0x12835b01U,
	0x243185beU, // 10
	0x550c7dc3U,
	0x72be5d74U,
	0x80deb1feU,
	0x9bdc06a7U,
	0xe49b69c1U,
	0xefbe4786U,
	0x0fc19dc6U,
	0x240ca1ccU,
	0x2de92c6fU,
	0x4a7484aaU, // 20
	0x5cb0a9dcU,
	0x76f988daU,
	0x983e5152U,
	0xa831c66dU,
	0xb00327c8U,
	0xbf597fc7U,
	0xc6e00bf3U,
	0xd5a79147U,
	0x06ca6351U,
	0x14292967U, // 30
	0x27b70a85U,
	0x2e1b2138U,
	0x4d2c6dfcU,
	0x53380d13U,
	0x650a7354U,
	0x766a0abbU,
	0x81c2c92eU,
	0x92722c85U,
	0xa2bfe8a1U,
	0xa81a664bU, // 40
	0xc24b8b70U,
	0xc76c51a3U,
	0xd192e819U,
	0xd6990624U,
	0xf40e3585U,
	0x106aa070U,
	0x19a4c116U,
	0x1e376c08U,
	0x2748774cU,
	0x34b0bcb5U, // 50
	0x391c0cb3U,
	0x4ed8aa4aU,
	0x5b9cca4fU,
	0x682e6ff3U,
	0x748f82eeU,
	0x78a5636fU,
	0x84c87814U,
	0x8cc70208U,
	0x90befffaU,
	0xa4506cebU, // 60
	0xbef9a3f7U,
	0xc67178f2U,
	0x98c7e2a2U,
	0xfc08884dU,
	0xcd2a11aeU,
	0x510e527fU,
	0x9b05688cU,
	0xC3910C8EU,
	0xfb6feee7U,
	0x2a01a605U, // 70
	0x0c2e12e0U,
	0x4498517BU,
	0x6a09e667U,
	0xa4ce148bU,
	0x95F61999U,
	0xc19bf174U,
	0xBB67AE85U,
	0x3C6EF372U,
	0xA54FF53AU,
	0x1F83D9ABU, // 80
	0x5BE0CD19U
};


/*
	0x5C5C5C5CU,
	0x36363636U,
	0x80000000U,
//	0x000003FFU, //never used
	0x00000280U,
	0x000004a0U,
	0x00000300U
};
*/

#define FOURdeclare(myvar, v1, v2, v3, v4) uint myvar ## 1 = v1; \
                                           uint myvar ## 2 = v2; \
                                           uint myvar ## 3 = v3; \
                                           uint myvar ## 4 = v4;

#define FOURassign(myvar, v1, v2, v3, v4) myvar ## 1 = v1; \
                                          myvar ## 2 = v2; \
                                          myvar ## 3 = v3; \
                                          myvar ## 4 = v4;

#define FOURcopy(var1, var2) var1 ## 1 = var2 ## 1; \
                             var1 ## 2 = var2 ## 2; \
                             var1 ## 3 = var2 ## 3; \
                             var1 ## 4 = var2 ## 4;

#define FOURtovec(var1, var2) var1 .x = var2 ## 1; \
                              var1 .y = var2 ## 2; \
                              var1 .z = var2 ## 3; \
                              var1 .w = var2 ## 4;

#define FOURfromvec(var1, var2) var1 ## 1 = var2 .x; \
                                var1 ## 2 = var2 .y; \
                                var1 ## 3 = var2 .z; \
                                var1 ## 4 = var2 .w;

#define UNROLL_FACTOR 2


#define rotl(x,y) rotate(x,y)
#define Ch(x,y,z) bitselect(z,y,x)
#define Maj(x,y,z) Ch((x^z),y,z)

#define EndianSwapa(n) (Ch(E0, rotl(n, 8U), rotl(n, 24U)))
#define EndianSwapb(n) (rotl(n & E0, 24U)|rotl(n & E1, 8U))

#define Tr2(x)		(rotl(x, 30U) ^ rotl(x, 19U) ^ rotl(x, 10U))
#define Tr1(x)		(rotl(x, 26U) ^ rotl(x, 21U) ^ rotl(x, 7U))
#define Wr2(x)		(rotl(x, 25U) ^ rotl(x, 14U) ^ (x>>3U))
#define Wr1(x)		(rotl(x, 15U) ^ rotl(x, 13U) ^ (x>>10U))

#define RND(a, b, c, d, e, f, g, h, k)  \
	h += Tr1(e); 			\
	h += Ch(e, f, g); 		\
	h += k;				\
	d += h;				\
	h += Tr2(a); 			\
	h += Maj(a, b, c);

void SHA256(uint4 *restrict state0,uint4 *restrict state1, const uint4 block0, const uint4 block1, const uint4 block2, const uint4 block3, bool notfresh){

	
#define A S0a
#define B S0b
#define C S0c
#define D S0d
#define E S1a
#define F S1b
#define G S1c
#define H S1d

#define Wx a1
#define Wy a2
#define Wz a3
#define Ww a4

#define Xx b1
#define Xy b2
#define Xz b3 
#define Xw b4

#define Yx c1
#define Yy c2
#define Yz c3 
#define Yw c4

#define Zx d1
#define Zy d2
#define Zz d3 
#define Zw d4

	uint4 tmp0 = *state0;
	uint4 tmp1 = *state1;

	uint a1 = block0.x;
	uint b1 = block1.x;
	uint c1 = block2.x;
	uint d1 = block3.x;
	uint S0a = tmp0.x;
	uint S1a = tmp1.x;

	uint a2 = block0.y;
	uint b2 = block1.y;
	uint c2 = block2.y;
	uint d2 = block3.y;
	uint S0b = tmp0.y;
	uint S1b = tmp1.y;

	uint a3 = block0.z;
	uint b3 = block1.z;
	uint c3 = block2.z;
	uint d3 = block3.z;
	uint S0c = tmp0.z;
	uint S1c = tmp1.z;

	uint a4 = block0.w;
	uint b4 = block1.w;
	uint c4 = block2.w;
	uint d4 = block3.w;
	uint S0d = tmp0.w;
	uint S1d = tmp1.w;

	uint K0, K1, K2, K3, K4, K5, K6, K7;

	if(notfresh){
		K0 = K[0];
		K1 = K[1];
		K2 = K[2];
		K3 = K[3];
		RND(A,B,C,D,E,F,G,H, K0+Wx);
		RND(H,A,B,C,D,E,F,G, K1+Wy);
		RND(G,H,A,B,C,D,E,F, K2+Wz);
		RND(F,G,H,A,B,C,D,E, K3+Ww);
	}else{
		K0 = K[63];
		K1 = K[64];
		K2 = K[65];
		K3 = K[66];
		K4 = K[67];
		K5 = K[68];
		K6 = K[69];
		K7 = K[70];

		D= K0 +Wx;
		H= K1 +Wx;
		C= K2 +Tr1(D)+Ch(D, K3, K4)+Wy;
		G= K5 +C+Tr2(H)+Ch(H, K6 ,K7);

		tmp1.x = K3;
		tmp1.y = K4;

		K4 = K[71];
		K5 = K[72];
		K6 = K[73];
		K7 = K[74];
		K0 = K[75];

		B= K4 +Tr1(C)+Ch(C,D,K3)+Wz;
		F= K5 +B+Tr2(G)+Maj(G,H, K6);
		A= K7 +Tr1(B)+Ch(B,C,D)+Ww;
		E= K0 +A+Tr2(F)+Maj(F,G,H);

		tmp0.x = K6;
		tmp0.y = K[77];
		tmp0.z = K[78];
		tmp0.w = K[79];
		tmp1.z = K[80];
		tmp1.w = K[81];
	}

	K0 = K[4];
	K1 = K[5];
	K2 = K[6];
	K3 = K[7];
	RND(E,F,G,H,A,B,C,D,K0+Xx);
	RND(D,E,F,G,H,A,B,C,K1+Xy);
	RND(C,D,E,F,G,H,A,B,K2+Xz);
	RND(B,C,D,E,F,G,H,A,K3+Xw);
	K4 = K[8];
	K5 = K[9];
	K6 = K[10];
	K7 = K[11];
	RND(A,B,C,D,E,F,G,H,K4+Yx);
	RND(H,A,B,C,D,E,F,G,K5+Yy);
	RND(G,H,A,B,C,D,E,F,K6+Yz);
	RND(F,G,H,A,B,C,D,E,K7+Yw);
	K7 = K[76];
	K0 = K[12];
	K1 = K[13];
	K2 = K[14];
	RND(E,F,G,H,A,B,C,D,K0+Zx);
	RND(D,E,F,G,H,A,B,C,K1+Zy);
	RND(C,D,E,F,G,H,A,B,K2+Zz);
	RND(B,C,D,E,F,G,H,A,K7+Zw);
	K3 = K[15];
	K4 = K[16];
	K5 = K[17];
	K6 = K[18];
	Wx += Wr1(Zz) + Yy + Wr2(Wy);
	RND(A,B,C,D,E,F,G,H, Wx+ K3);
	Wy += Wr1(Zw) + Yz + Wr2(Wz);
	RND(H,A,B,C,D,E,F,G, Wy+ K4);
	Wz += Wr1(Wx) + Yw + Wr2(Ww);
	RND(G,H,A,B,C,D,E,F, Wz+ K5);
	Ww += Wr1(Wy) + Zx + Wr2(Xx);
	RND(F,G,H,A,B,C,D,E, Ww+ K6);
	K0 = K[19];
	K1 = K[20];
	K2 = K[21];
	K3 = K[22];
	Xx += Wr1(Wz) + Zy + Wr2(Xy);
	RND(E,F,G,H,A,B,C,D, Xx+ K0);
	Xy += Wr1(Ww) + Zz + Wr2(Xz);
	RND(D,E,F,G,H,A,B,C, Xy+ K1);
	Xz += Wr1(Xx) + Zw + Wr2(Xw);
	RND(C,D,E,F,G,H,A,B, Xz+ K2);
	Xw += Wr1(Xy) + Wx + Wr2(Yx);
	RND(B,C,D,E,F,G,H,A, Xw+ K3);
	K4 = K[23];
	K5 = K[24];
	K6 = K[25];
	K7 = K[26];
	Yx += Wr1(Xz) + Wy + Wr2(Yy);
	RND(A,B,C,D,E,F,G,H, Yx+ K4);
	Yy += Wr1(Xw) + Wz + Wr2(Yz);
	RND(H,A,B,C,D,E,F,G, Yy+ K5);
	Yz += Wr1(Yx) + Ww + Wr2(Yw);
	RND(G,H,A,B,C,D,E,F, Yz+ K6);
	Yw += Wr1(Yy) + Xx + Wr2(Zx);
	RND(F,G,H,A,B,C,D,E, Yw+ K7);
	K0 = K[27];
	K1 = K[28];
	K2 = K[29];
	K3 = K[30];
	Zx += Wr1(Yz) + Xy + Wr2(Zy);
	RND(E,F,G,H,A,B,C,D, Zx+ K0);
	Zy += Wr1(Yw) + Xz + Wr2(Zz);
	RND(D,E,F,G,H,A,B,C, Zy+ K1);
	Zz += Wr1(Zx) + Xw + Wr2(Zw);
	RND(C,D,E,F,G,H,A,B, Zz+ K2);
	Zw += Wr1(Zy) + Yx + Wr2(Wx);
	RND(B,C,D,E,F,G,H,A, Zw+ K3);
	K4 = K[31];
	K5 = K[32];
	K6 = K[33];
	K7 = K[34];
	Wx += Wr1(Zz) + Yy + Wr2(Wy);
	RND(A,B,C,D,E,F,G,H, Wx+ K4);
	Wy += Wr1(Zw) + Yz + Wr2(Wz);
	RND(H,A,B,C,D,E,F,G, Wy+ K5);
	Wz += Wr1(Wx) + Yw + Wr2(Ww);
	RND(G,H,A,B,C,D,E,F, Wz+ K6);
	Ww += Wr1(Wy) + Zx + Wr2(Xx);
	RND(F,G,H,A,B,C,D,E, Ww+ K7);
	K0 = K[35];
	K1 = K[36];
	K2 = K[37];
	K3 = K[38];
	Xx += Wr1(Wz) + Zy + Wr2(Xy);
	RND(E,F,G,H,A,B,C,D, Xx+ K0);
	Xy += Wr1(Ww) + Zz + Wr2(Xz);
	RND(D,E,F,G,H,A,B,C, Xy+ K1);
	Xz += Wr1(Xx) + Zw + Wr2(Xw);
	RND(C,D,E,F,G,H,A,B, Xz+ K2);
	Xw += Wr1(Xy) + Wx + Wr2(Yx);
	RND(B,C,D,E,F,G,H,A, Xw+ K3);
	K4 = K[39];
	K5 = K[40];
	K6 = K[41];
	K7 = K[42];
	Yx += Wr1(Xz) + Wy + Wr2(Yy);
	RND(A,B,C,D,E,F,G,H, Yx+ K4);
	Yy += Wr1(Xw) + Wz + Wr2(Yz);
	RND(H,A,B,C,D,E,F,G, Yy+ K5);
	Yz += Wr1(Yx) + Ww + Wr2(Yw);
	RND(G,H,A,B,C,D,E,F, Yz+ K6);
	Yw += Wr1(Yy) + Xx + Wr2(Zx);
	RND(F,G,H,A,B,C,D,E, Yw+ K7);
	K0 = K[43];
	K1 = K[44];
	K2 = K[45];
	K3 = K[46];
	Zx += Wr1(Yz) + Xy + Wr2(Zy);
	RND(E,F,G,H,A,B,C,D, Zx+ K0);
	Zy += Wr1(Yw) + Xz + Wr2(Zz);
	RND(D,E,F,G,H,A,B,C, Zy+ K1);
	Zz += Wr1(Zx) + Xw + Wr2(Zw);
	RND(C,D,E,F,G,H,A,B, Zz+ K2);
	Zw += Wr1(Zy) + Yx + Wr2(Wx);
	RND(B,C,D,E,F,G,H,A, Zw+ K3);
	K4 = K[47];
	K5 = K[48];
	K6 = K[49];
	K7 = K[50];
	Wx += Wr1(Zz) + Yy + Wr2(Wy);
	RND(A,B,C,D,E,F,G,H, Wx+ K4);
	Wy += Wr1(Zw) + Yz + Wr2(Wz);
	RND(H,A,B,C,D,E,F,G, Wy+ K5);
	Wz += Wr1(Wx) + Yw + Wr2(Ww);
	RND(G,H,A,B,C,D,E,F, Wz+ K6);
	Ww += Wr1(Wy) + Zx + Wr2(Xx);
	RND(F,G,H,A,B,C,D,E, Ww+ K7);
	K0 = K[51];
	K1 = K[52];
	K2 = K[53];
	K3 = K[54];
	Xx += Wr1(Wz) + Zy + Wr2(Xy);
	RND(E,F,G,H,A,B,C,D, Xx+ K0);
	Xy += Wr1(Ww) + Zz + Wr2(Xz);
	RND(D,E,F,G,H,A,B,C, Xy+ K1);
	Xz += Wr1(Xx) + Zw + Wr2(Xw);
	RND(C,D,E,F,G,H,A,B, Xz+ K2);
	Xw += Wr1(Xy) + Wx + Wr2(Yx);
	RND(B,C,D,E,F,G,H,A, Xw+ K3);
	K4 = K[55];
	K5 = K[56];
	K6 = K[57];
	K7 = K[58];
	Yx += Wr1(Xz) + Wy + Wr2(Yy);
	RND(A,B,C,D,E,F,G,H, Yx+ K4);
	Yy += Wr1(Xw) + Wz + Wr2(Yz);
	RND(H,A,B,C,D,E,F,G, Yy+ K5);
	Yz += Wr1(Yx) + Ww + Wr2(Yw);
	RND(G,H,A,B,C,D,E,F, Yz+ K6);
	Yw += Wr1(Yy) + Xx + Wr2(Zx);
	RND(F,G,H,A,B,C,D,E, Yw+ K7);
	K4 = K[59];
	K5 = K[60];
	K6 = K[61];
	K7 = K[62];
	Zx += Wr1(Yz) + Xy + Wr2(Zy);
	RND(E,F,G,H,A,B,C,D, Zx+ K4);
	Zy += Wr1(Yw) + Xz + Wr2(Zz);
	RND(D,E,F,G,H,A,B,C, Zy+ K5);
	Zz += Wr1(Zx) + Xw + Wr2(Zw);
	RND(C,D,E,F,G,H,A,B, Zz+ K6);
	Zw += Wr1(Zy) + Yx + Wr2(Wx);
	RND(B,C,D,E,F,G,H,A, Zw+ K7);
	
#undef A
#undef B
#undef C
#undef D
#undef E
#undef F
#undef G
#undef H

#undef Wx
#undef Wy
#undef Wz
#undef Ww

#undef Xx
#undef Xy
#undef Xz
#undef Xw

#undef Yx
#undef Yy
#undef Yz
#undef Yw

#undef Zx
#undef Zy
#undef Zz
#undef Zw

	tmp0 += (uint4)(S0a, S0b, S0c, S0d);
	tmp1 += (uint4)(S1a, S1b, S1c, S1d);

	*state0 = tmp0;
	*state1 = tmp1;
}

void halfsalsa(uint4 *w){
	for(uint i=0; i<4; ++i){
		w[0] ^= rotl(w[3]     +w[2]     , 7U);
		w[1] ^= rotl(w[0]     +w[3]     , 9U);
		w[2] ^= rotl(w[1]     +w[0]     ,13U);
		w[3] ^= rotl(w[2]     +w[1]     ,18U);
		w[2] ^= rotl(w[3].wxyz+w[0].zwxy, 7U);
		w[1] ^= rotl(w[2].wxyz+w[3].zwxy, 9U);
		w[0] ^= rotl(w[1].wxyz+w[2].zwxy,13U);
		w[3] ^= rotl(w[0].wxyz+w[1].zwxy,18U);
	}
}

//x[ 0] ^= (x[ 3] ? x[ 2])<<<18;

#if (LOOKUP_GAP == 2)
void salsa(uint4 *B, bool db){
#else
void salsa(uint4 *B){
#endif
    uint4 w[4];

	for(uint i=0; i<4; ++i)
		w[i] = (B[i]^=B[i+4]);
	halfsalsa(w);
	for(uint i=0; i<4; ++i)
		w[i] = (B[i+4]^=(B[i]+=w[i]));
	halfsalsa(w);
#if (LOOKUP_GAP == 2)
	if(db){
		for(uint i=0; i<4; ++i)
			w[i] = (B[i]^=(B[i+4]+=w[i]));
		halfsalsa(w);
		for(uint i=0; i<4; ++i)
			w[i] = (B[i+4]^=(B[i]+=w[i]));
		halfsalsa(w);
	}
#endif
	for(uint i=0; i<4; ++i)
		B[i+4] += w[i];
}

//#define Coord(x,y,z) (ulong)y*(x ## SIZE)+z*(y ## SIZE)*(x ## SIZE)+x
//#define CO Coord(z,x,y)

//#define Coord(x,y,z) ((ulong)y*(x ## SIZE)*(z ## SIZE)+x*(z ## SIZE)+z)
//#define CO Coord(x,y,z)


//#define Coord(x,y,z) (ulong)x*(z ## SIZE)+y*(x ## SIZE)*(z ## SIZE)+z
//#define CO Coord(x,y,z)

//#define Coord(x,y,z) (ulong)x+y*(x ## SIZE)*(z ## SIZE)+z
//#define CO Coord(x,y,z)


//#define Coord(x,y,z) ((ulong)y*(x ## SIZE)*(z ## SIZE)+x+z)
//#define CO Coord(x,y,z)

void scrypt_core(uint4 *X, __global uint4 *restrict lookup){
	const uint zSIZE = 8;
	//const uint ySIZE = (N[NFACTOR]/LOOKUP_GAP+(N[NFACTOR]%LOOKUP_GAP>0));
	const uint xSIZE = CONCURRENT_THREADS;
	//uint x = get_global_id(0)%xSIZE;
	uint4 tmp[4];

	//const uint loop_ceiling = ySIZE*zSIZE-zSIZE;
	//const uint loop_ceiling = N[NFACTOR]/LOOKUP_GAP*zSIZE-zSIZE;
	uint x = (get_global_id(0)%xSIZE)*zSIZE;
	//uint E0 = ES[0];
	//uint E1 = ES[1];

	tmp[0] = (uint4)(X[1].x,X[2].y,X[3].z,X[0].w);
	tmp[1] = (uint4)(X[2].x,X[3].y,X[0].z,X[1].w);
	tmp[2] = (uint4)(X[3].x,X[0].y,X[1].z,X[2].w);
	tmp[3] = (uint4)(X[0].x,X[1].y,X[2].z,X[3].w);

	X[0] = EndianSwapa(tmp[0]);
	X[1] = EndianSwapb(tmp[1]);
	X[2] = EndianSwapb(tmp[2]);
	X[3] = EndianSwapb(tmp[3]);

	tmp[0] = (uint4)(X[5].x,X[6].y,X[7].z,X[4].w);
	tmp[1] = (uint4)(X[6].x,X[7].y,X[4].z,X[5].w);
	tmp[2] = (uint4)(X[7].x,X[4].y,X[5].z,X[6].w);
	tmp[3] = (uint4)(X[4].x,X[5].y,X[6].z,X[7].w);

	X[4] = EndianSwapa(tmp[0]);
	X[5] = EndianSwapb(tmp[1]);
	X[6] = EndianSwapb(tmp[2]);
	X[7] = EndianSwapb(tmp[3]);

	for(uint y=0; y<(N[NFACTOR]/LOOKUP_GAP); ++y){
		for(uint z=0; z<zSIZE; ++z)
			lookup[IDX] = X[z];

//	for(uint y=0; y<loop_ceiling; y+=zSIZE){
//		for(uint z=0; z<zSIZE; ++z)
//			lookup[IDX] = X[z];
#if (LOOKUP_GAP == 2)
		salsa(X, 1);
#elif (LOOKUP_GAP == 1)
		salsa(X);
#else
		for(uint i=0; i<LOOKUP_GAP; ++i)
			salsa(X);
#endif
	}
#if (LOOKUP_GAP != 1) && (LOOKUP_GAP != 2) && (LOOKUP_GAP != 4) && (LOOKUP_GAP != 8)
        uint y = (N[NFACTOR]/LOOKUP_GAP);
		//uint y = loop_ceiling;
        for(uint z=0; z<zSIZE; ++z)
            lookup[IDX] = X[z];
        for(uint i=0; i<N[NFACTOR]%LOOKUP_GAP; ++i)
            salsa(X);
#endif

#if (LOOKUP_GAP != 1)
    for (uint i=0; i<N[NFACTOR]; ++i){
        uint j = X[7].x & (N[NFACTOR]-1);
#else
	for (uint i=0; i<N[NFACTOR]; ++i){
		uint y = X[7].x & (N[NFACTOR]-1);
#endif

#if (LOOKUP_GAP == 1)
		
#elif (LOOKUP_GAP == 2)
		uint y = (j>>1);
#elif (LOOKUP_GAP == 4)
		uint y = (j>>2);
#elif (LOOKUP_GAP == 8)
		uint y = (j>>3);
#else
		uint y = (j/LOOKUP_GAP);
#endif

#if (LOOKUP_GAP != 2) && (LOOKUP_GAP != 1)
		uint4 V[8];
		for(uint z=0; z<zSIZE; ++z)
			V[z] = lookup[IDX];
#endif

#if (LOOKUP_GAP == 1)
		for(uint z=0; z<zSIZE; ++z)
			X[z] ^= lookup[IDX];
#elif (LOOKUP_GAP == 2)
		if(j&1){
			uint4 V[8];
			for(uint z=0; z<zSIZE; ++z)
				V[z] = lookup[IDX];
			salsa(V, 0);
			for(uint z=0; z<zSIZE; ++z)
				X[z] ^= V[z];
		}else{

			for(uint z=0; z<zSIZE; ++z)
				X[z] ^= lookup[IDX];

			//	X[0] ^= lookup[IDX];
			//	X[1] ^= lookup[IDX];
			//	X[2] ^= lookup[IDX];
			//	X[3] ^= lookup[IDX];
			//	X[4] ^= lookup[IDX];
			//	X[5] ^= lookup[IDX];
			//	X[6] ^= lookup[IDX];
			//	X[7] ^= lookup[IDX];
		}
#else
		//uint val = j%LOOKUP_GAP;
	
		for(uint z=0; z<zSIZE; ++z)
			X[z] ^= V[z];
		//uint val = j%LOOKUP_GAP;
		//for (uint z=0; z<val; ++z)
		//	salsa(V);


#endif

#if (LOOKUP_GAP != 2) && (LOOKUP_GAP != 1)
#endif

#if (LOOKUP_GAP == 2)
        salsa(X, 0);
#elif (LOOKUP_GAP == 1)
        salsa(X);
#else
		uint val = j%LOOKUP_GAP;
		for (uint z=0; z<val; ++z)
			salsa(V);
#endif
    }

	tmp[0] = (uint4)(X[3].x,X[2].y,X[1].z,X[0].w);
	tmp[1] = (uint4)(X[0].x,X[3].y,X[2].z,X[1].w);
	tmp[2] = (uint4)(X[1].x,X[0].y,X[3].z,X[2].w);
	tmp[3] = (uint4)(X[2].x,X[1].y,X[0].z,X[3].w);

	X[0] = EndianSwapa(tmp[0]);
	X[1] = EndianSwapb(tmp[1]);
	X[2] = EndianSwapb(tmp[2]);
	X[3] = EndianSwapb(tmp[3]);

	tmp[0] = (uint4)(X[7].x,X[6].y,X[5].z,X[4].w);
	tmp[1] = (uint4)(X[4].x,X[7].y,X[6].z,X[5].w);
	tmp[2] = (uint4)(X[5].x,X[4].y,X[7].z,X[6].w);
	tmp[3] = (uint4)(X[6].x,X[5].y,X[4].z,X[7].w);

	X[4] = EndianSwapa(tmp[0]);
	X[5] = EndianSwapb(tmp[1]);
	X[6] = EndianSwapb(tmp[2]);
	X[7] = EndianSwapb(tmp[3]);
}

__constant uint fixedWa[8] = {0x428a2f99,0xd807aa98,0xf59b89c2,0xb707775c,0xad87a3ea,0xc91b1417,0xe64fb6a2,0xe0a1adbe};
__constant uint fixedWb[8] = {0xf1374491,0x12835b01,0x73924787,0x0468c23f,0xbcb1d3a3,0xc359dce1,0xe84d923a,0x7c728e11};
__constant uint fixedWc[8] = {0xb5c0fbcf,0x243185be,0x23c6886e,0xe7e72b4c,0x7b993186,0xa83253a7,0xe93a5730,0x511c78e4};
__constant uint fixedWd[8] = {0xe9b5dba5,0x550c7dc3,0xa42ca65c,0x49e1f1a2,0x562b9420,0x3b13c12d,0x09837686,0x315b45bd};
__constant uint fixedWe[8] = {0x3956c25b,0x72be5d74,0x15ed3627,0x4b99c816,0xbff3ca0c,0x9d3d725d,0x078ff753,0xfca71413};
__constant uint fixedWf[8] = {0x59f111f1,0x80deb1fe,0x4d6edcbf,0x926d1570,0xda4b0c23,0xd9031a84,0x29833341,0xea28f96a};
__constant uint fixedWg[8] = {0x923f82a4,0x9bdc06a7,0xe28217fc,0xaa0fc072,0x6cd8711a,0xb1a03340,0xd5de0b7e,0x79703128};
__constant uint fixedWh[8] = {0xab1c5ed5,0xc19bf794,0xef02488f,0xadb36e2c,0x8f337caa,0x16f58012,0x6948ccf4,0x4e1ef848};
__constant uint sK[6] = {
    0x5C5C5C5CU, //82
    0x36363636U,
    0x80000000U,
//  0x000003FFU, //never used
    0x00000280U,
    0x000004a0U,
    0x00000300U
};

#define FOUND (0xFF)
#define SETFOUND(Xnonce) output[output[FOUND]++] = Xnonce

__attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
__kernel void search(__global const uint4 *restrict input,
volatile __global uint *restrict output, __global uint4 *restrict padcache, const uint4 midstate0, const uint4 midstate16, const uint target){

	uint gid = get_global_id(0);
	uint4 X[8];
	uint4 tmp0, tmp1;
//uint4 tstate0, tstate1, ostate0, ostate1, tmp0, tmp1;
	//uint4 data = (uint4)(input[4].x,input[4].y,input[4].z,gid);
	//uint4 pad0 = midstate0, pad1 = midstate16;

	uint4 pass0, pass1, pass2;

	FOURdeclare(tstate0, 0, 0, 0, 0);
	FOURdeclare(tstate1, 0, 0, 0, 0);
	FOURdeclare(ostate0, 0, 0, 0, 0);
	FOURdeclare(ostate1, 0, 0, 0, 0);
	FOURdeclare(data, input[4].x,input[4].y,input[4].z,gid);

	//FOURdeclare(data, input[12], input[13], input[14], gid);
	//FOURdeclare(pad0, midstate0[0], midstate0[1], midstate0[2], midstate0[3]);
	//FOURdeclare(pad1, midstate16[0], midstate16[1], midstate16[2], midstate16[3]);

	FOURdeclare(pad0, midstate0.x, midstate0.y, midstate0.z, midstate0.w);
	FOURdeclare(pad1, midstate16.x, midstate16.y, midstate16.z, midstate16.w);

	FOURtovec(pass0, pad0);
	FOURtovec(pass1, pad1);
	FOURtovec(pass2, data);

	SHA256(&pass0,&pass1, pass2, (uint4)(sK[2],0,0,0), (uint4)(0,0,0,0), (uint4)(0,0,0, sK[3]), 1);
	FOURfromvec(pad0, pass0);
	FOURfromvec(pad1, pass1);
	//FOURfromvec(data, pass2);

    //FOURtovec(pass0, ostate0);
    //FOURtovec(pass1, ostate1);
	tmp0 = pass0^sK[0];
	tmp1 = pass1^sK[0];
	FOURtovec(pass0, ostate0);
	FOURtovec(pass1, ostate1);
	SHA256(&pass0, &pass1, tmp0, tmp1, sK[0], sK[0], 0);
    FOURfromvec(ostate0, pass0);
    FOURfromvec(ostate1, pass1);

	FOURtovec(pass0, pad0);
	FOURtovec(pass1, pad1);
	tmp0 = pass0^sK[1];
	tmp1 = pass1^sK[1];
	FOURtovec(pass0, tstate0);
	FOURtovec(pass1, tstate1);
	SHA256(&pass0, &pass1, tmp0, tmp1, sK[1], sK[1], 0);
	FOURfromvec(tstate0, pass0);
	FOURfromvec(tstate1, pass1);

	tmp0 = pass0;
	tmp1 = pass1;

	//FOURtovec(pass0, tstate0);
	//FOURtovec(pass1, tstate1);
	SHA256(&pass0, &pass1, input[0],input[1],input[2],input[3], 1);
	FOURfromvec(tstate0, pass0);
	FOURfromvec(tstate1, pass1);

/*

    SHA256(&pad0,&pad1, data, (uint4)(sK[2],0,0,0), (uint4)(0,0,0,0), (uint4)(0,0,0, sK[3]), 1);
    SHA256(&ostate0,&ostate1, pad0^sK[0], pad1^sK[0], sK[0], sK[0], 0);
    SHA256(&tstate0,&tstate1, pad0^sK[1], pad1^sK[1], sK[1], sK[1], 0);

    tmp0 = tstate0;
    tmp1 = tstate1;
    SHA256(&tstate0, &tstate1, input[0],input[1],input[2],input[3], 1);


	for (uint i=0; i<4; i++){
		pad0 = tstate0;
		pad1 = tstate1;
		X[(i<<1) ] = ostate0;
		X[(i<<1)+1] = ostate1;
		SHA256(&pad0, &pad1, data, (uint4)(i+1,sK[2],0,0), (uint4)(0,0,0,0), (uint4)(0,0,0, sK[4]), 1);
		SHA256(X+(i<<1),X+(i<<1)+1, pad0, pad1, (uint4)(sK[2], 0U, 0U, 0U), (uint4)(0U, 0U, 0U, sK[5]), 1);
	}
*/

	for (uint i=0; i<4; i++){
		FOURcopy(pad0, tstate0);
		FOURcopy(pad1, tstate1);

		//FOURtovec( X[(i<<1) ], ostate0);
		//FOURtovec( X[(i<<1)+1], ostate1);
		FOURtovec(pass0, ostate0);
		FOURtovec(pass1, ostate1);
		X[(i<<1) ] = pass0;
		X[(i<<1)+1] = pass1;

		FOURtovec(pass0, pad0);
		FOURtovec(pass1, pad1);
		FOURtovec(pass2, data);
		SHA256(&pass0, &pass1, pass2, (uint4)(i+1,sK[2],0,0), (uint4)(0,0,0,0), (uint4)(0,0,0, sK[4]), 1);
		FOURfromvec(pad0, pass0);
		FOURfromvec(pad1, pass1);
		SHA256(X+(i<<1),X+(i<<1)+1, pass0, pass1, (uint4)(sK[2], 0U, 0U, 0U), (uint4)(0U, 0U, 0U, sK[5]), 1);
	}


	//scrypt_write(X,padcache);
	//scrypt_read(X,padcache);

	scrypt_core(X,padcache);

	SHA256(&tmp0,&tmp1, X[0], X[1], X[2], X[3], 1);
	SHA256(&tmp0,&tmp1, X[4], X[5], X[6], X[7], 1);

	//tstate0 = tmp0;
	//tstate1 = tmp1;
	FOURfromvec(tstate0, tmp0);
	FOURfromvec(tstate1, tmp1);

#define A tstate01
#define B tstate02
#define C tstate03
#define D tstate04
#define E tstate11
#define F tstate12
#define G tstate13
#define H tstate14

//#pragma unroll UNROLL_FACTOR
	for(uint i=0; i<8; i++){
		RND(A,B,C,D,E,F,G,H, fixedWa[i]);
		RND(H,A,B,C,D,E,F,G, fixedWb[i]);
		RND(G,H,A,B,C,D,E,F, fixedWc[i]);
		RND(F,G,H,A,B,C,D,E, fixedWd[i]);
		RND(E,F,G,H,A,B,C,D, fixedWe[i]);
		RND(D,E,F,G,H,A,B,C, fixedWf[i]);
		RND(C,D,E,F,G,H,A,B, fixedWg[i]);
		RND(B,C,D,E,F,G,H,A, fixedWh[i]);
	}

#undef A
#undef B
#undef C
#undef D
#undef E
#undef F
#undef G
#undef H

	FOURtovec(pass0, tstate0);
	FOURtovec(pass1, tstate1);

	tmp0 += pass0;
	tmp1 += pass1;

	FOURtovec(pass2, ostate0);
	FOURtovec(pass0, ostate1);
	SHA256(&pass2,&pass0, tmp0, tmp1, (uint4)(sK[2], 0U, 0U, 0U), (uint4)(0U, 0U, 0U, sK[5]), 1);

	//FOURfromvec(ostate0, pass2);
	FOURfromvec(ostate1, pass0);
    //uint E0 = ES[0];
	bool result = (EndianSwapa(ostate14) <= target);
	if (result)
		SETFOUND(gid);
}
