/*-
 * Copyright 2009 Colin Percival, 2011 ArtForz, 2011 pooler, 2012 mtrlt,
 * 2012-2013 Con Kolivas.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * This file was originally written by Colin Percival as part of the Tarsnap
 * online backup system.
 */

/*
 * Power-of-two lookup table for the scrypt N (CPU/memory cost) parameter:
 * N[i] == 2^i for i in 0..20.  Presumably indexed by the nfactor (see
 * NFACTOR) -- the indexing code is not part of this section.
 */
__constant uint N[] = {
    1U <<  0,  /* never used, padding */
    1U <<  1,
    1U <<  2,
    1U <<  3,
    1U <<  4,
    1U <<  5,
    1U <<  6,
    1U <<  7,
    1U <<  8,
    1U <<  9,
    1U << 10,  /* 2^10 == 1024, Litecoin scrypt default */
    1U << 11,
    1U << 12,
    1U << 13,
    1U << 14,
    1U << 15,
    1U << 16,
    1U << 17,
    1U << 18,
    1U << 19,
    1U << 20
};

/*
 * Constant pool for the hashing code in this file.  Entries are addressed by
 * literal index, so the order must not change.
 *
 * Layout, as far as SHA256() and FIPS 180-4 establish it:
 *   K[0..14]   SHA-256 round constants for rounds 0..14.
 *   K[15..62]  SHA-256 round constants for rounds 16..63, i.e. shifted down
 *              by one slot relative to the standard table.
 *   K[76]      SHA-256 round constant for round 15 (0xc19bf174), stored out
 *              of sequence; SHA256() reads it explicitly for that round.
 *
 * The remaining entries (K[63..75], K[77..88]) are not referenced by SHA256().
 * Several equal well-known values -- the SHA-256 initial hash words
 * (K[73], K[77], K[78], K[79], K[66], K[67], K[80], K[81] == H0..H7), the
 * HMAC opad/ipad byte patterns (K[82], K[83]) and the 0x80000000 padding
 * word (K[84]) -- but their actual use is outside this section.
 * TODO(review): confirm against the code that reads them.
 *
 * The trailing "// 10", "// 20", ... comments mark the array index of that
 * line.
 */
__constant uint K[] = {
	/* K[0..14]: SHA-256 round constants 0..14 */
	0x428a2f98U,
	0x71374491U,
	0xb5c0fbcfU,
	0xe9b5dba5U,
	0x3956c25bU,
	0x59f111f1U,
	0x923f82a4U,
	0xab1c5ed5U,
	0xd807aa98U,
	0x12835b01U,
	0x243185beU, // 10
	0x550c7dc3U,
	0x72be5d74U,
	0x80deb1feU,
	0x9bdc06a7U,
	/* K[15..62]: SHA-256 round constants 16..63 (round 15 lives at K[76]) */
	0xe49b69c1U,
	0xefbe4786U,
	0x0fc19dc6U,
	0x240ca1ccU,
	0x2de92c6fU,
	0x4a7484aaU, // 20
	0x5cb0a9dcU,
	0x76f988daU,
	0x983e5152U,
	0xa831c66dU,
	0xb00327c8U,
	0xbf597fc7U,
	0xc6e00bf3U,
	0xd5a79147U,
	0x06ca6351U,
	0x14292967U, // 30
	0x27b70a85U,
	0x2e1b2138U,
	0x4d2c6dfcU,
	0x53380d13U,
	0x650a7354U,
	0x766a0abbU,
	0x81c2c92eU,
	0x92722c85U,
	0xa2bfe8a1U,
	0xa81a664bU, // 40
	0xc24b8b70U,
	0xc76c51a3U,
	0xd192e819U,
	0xd6990624U,
	0xf40e3585U,
	0x106aa070U,
	0x19a4c116U,
	0x1e376c08U,
	0x2748774cU,
	0x34b0bcb5U, // 50
	0x391c0cb3U,
	0x4ed8aa4aU,
	0x5b9cca4fU,
	0x682e6ff3U,
	0x748f82eeU,
	0x78a5636fU,
	0x84c87814U,
	0x8cc70208U,
	0x90befffaU,
	0xa4506cebU, // 60
	0xbef9a3f7U,
	0xc67178f2U,
	/* K[63..88]: additional constants; only K[76] is used by SHA256() */
	0x98c7e2a2U,
	0xfc08884dU,
	0xcd2a11aeU,
	0x510e527fU,
	0x9b05688cU,
	0xC3910C8EU,
	0xfb6feee7U,
	0x2a01a605U, // 70
	0x0c2e12e0U,
	0x4498517BU,
	0x6a09e667U,
	0xa4ce148bU,
	0x95F61999U,
	/* K[76]: SHA-256 round constant for round 15 */
	0xc19bf174U,
	0xBB67AE85U,
	0x3C6EF372U,
	0xA54FF53AU,
	0x1F83D9ABU, // 80
	0x5BE0CD19U,
    0x5C5C5C5CU,
    0x36363636U,
    0x80000000U,
    0x000003FFU,
    0x00000280U,
    0x000004a0U,
    0x00000300U
};


/*
 * Default N-factor, used only when NFACTOR has not already been defined
 * before this point (e.g. by a build option).
 */
#if !defined(NFACTOR)
#  define NFACTOR 10
#endif



/*
 * Byte-lane masks for the EndianSwap macros: E0 keeps bytes 0 and 2 of a
 * 32-bit word, E1 keeps bytes 1 and 3.  Both are explicitly unsigned so they
 * have the same type as the uint operands they are combined with.
 */
#define E0 0x00FF00FFU
#define E1 0xFF00FF00U

/*
 * Thin wrappers over OpenCL built-ins.
 *   rotl(x,y)   rotate x left by y bits.
 *   Ch(x,y,z)   per bit: y where x is set, z where x is clear.
 *   Maj(x,y,z)  per-bit majority of x, y, z, expressed through Ch.
 * Every macro argument is parenthesized so compound expressions
 * (e.g. `a ^ b`) expand with the intended precedence.  Arguments may be
 * evaluated more than once; pass side-effect-free expressions only.
 */
#define rotl(x,y) rotate((x),(y))
#define Ch(x,y,z) bitselect((z),(y),(x))
#define Maj(x,y,z) Ch(((x)^(z)),(y),(z))

/* Two equivalent formulations of a 32-bit byte-order reversal. */
#define EndianSwapa(n) (Ch(E0, rotl((n), 8U), rotl((n), 24U)))
#define EndianSwapb(n) (rotl((n) & E0, 24U)|rotl((n) & E1, 8U))

/*
 * SHA-256 mixing functions written with left rotations
 * (rotl by r is a right rotation by 32 - r):
 *   Tr2 = rotr 2 ^ rotr 13 ^ rotr 22   (applied to `a` in RND)
 *   Tr1 = rotr 6 ^ rotr 11 ^ rotr 25   (applied to `e` in RND)
 *   Wr2 = rotr 7 ^ rotr 18 ^ shr 3     (message schedule)
 *   Wr1 = rotr 17 ^ rotr 19 ^ shr 10   (message schedule)
 */
#define Tr2(x)		(rotl((x), 30U) ^ rotl((x), 19U) ^ rotl((x), 10U))
#define Tr1(x)		(rotl((x), 26U) ^ rotl((x), 21U) ^ rotl((x), 7U))
#define Wr2(x)		(rotl((x), 25U) ^ rotl((x), 14U) ^ ((x)>>3U))
#define Wr1(x)		(rotl((x), 15U) ^ rotl((x), 13U) ^ ((x)>>10U))

/*
 * One SHA-256 round.  `d` and `h` must be modifiable lvalues and are updated
 * in place; callers rotate the argument order between rounds instead of
 * moving values between variables.  `k` is the round constant plus message
 * word.  Wrapped in do { } while (0) so that `RND(...);` is a single
 * statement and stays correct under an unbraced if/else.
 */
#define RND(a, b, c, d, e, f, g, h, k)	\
	do {				\
		h += Tr1(e);		\
		h += Ch(e, f, g);	\
		h += (k);		\
		d += h;			\
		h += Tr2(a);		\
		h += Maj(a, b, c);	\
	} while (0)

/*
 * Pack four consecutive uints into a uint4.
 *
 * dest   - receives (source[0], source[1], source[2], source[3]) as
 *          (x, y, z, w).
 * source - array of at least four readable uints.
 *
 * All four elements are read before *dest is written.
 */
void uintp_to_uint4p(uint4 *dest, uint *source){
    *dest = (uint4)(source[0], source[1], source[2], source[3]);
}

/*
 * Unpack the uint4 pointed to by source into four consecutive uints.
 *
 * dest   - array of at least four writable uints; receives x, y, z, w in
 *          dest[0..3].
 * source - vector to unpack; it is copied to a local before dest is written.
 */
void uint4p_to_uintp(uint *dest, uint4 *source){
    const uint4 vec = *source;

    dest[0] = vec.s0;
    dest[1] = vec.s1;
    dest[2] = vec.s2;
    dest[3] = vec.s3;
}

/*
 * Unpack a uint4 passed by value into four consecutive uints.
 *
 * dest   - array of at least four writable uints; receives x, y, z, w in
 *          dest[0..3].
 * source - vector to unpack.
 */
void uint4_to_uintp(uint *dest, uint4 source){
    const uint2 low  = source.lo;   /* (x, y) */
    const uint2 high = source.hi;   /* (z, w) */

    dest[0] = low.x;
    dest[1] = low.y;
    dest[2] = high.x;
    dest[3] = high.y;
}

/*
 * SHA-256 compression function: mixes one 16-word block into an 8-word state.
 *
 * state - 8 uints; read into the working variables A..H at entry and updated
 *         in place at the end (state[i] += working variable).
 * block - 16 uints; only read, never written.
 *
 * The two pointers are declared restrict, so they must not overlap.
 * Words are consumed exactly as given: no byte swapping, padding or length
 * encoding is done here.
 *
 * The 64 rounds are fully unrolled.  Instead of shifting A..H after each
 * round, the argument order passed to RND is rotated by one position per
 * round, returning to the original order every 8 rounds.  The sixteen W
 * variables act as a rolling window over the message schedule and are
 * overwritten in place from round 16 onward.
 *
 * Round constants come from K[]: rounds 0..14 use K[0..14], round 15 uses
 * K[76], and rounds 16..63 use K[15..62].
 */
void SHA256(uint*restrict state, uint*restrict block){
    /* Working variables, seeded from the current state. */
    uint A = state[0];
    uint B = state[1];
    uint C = state[2];
    uint D = state[3];
    uint E = state[4];
    uint F = state[5];
    uint G = state[6];
    uint H = state[7];
	/* Message words W[0..15], named W<row><lane> with four lanes (x,y,z,w) per row. */
	uint W0x = block[0];
	uint W0y = block[1];
	uint W0z = block[2];
	uint W0w = block[3];
    uint W1x = block[4];
    uint W1y = block[5];
    uint W1z = block[6];
    uint W1w = block[7];
    uint W2x = block[8];
    uint W2y = block[9];
    uint W2z = block[10];
    uint W2w = block[11];
    uint W3x = block[12];
    uint W3y = block[13];
    uint W3z = block[14];
    uint W3w = block[15];

	/* Rounds 0-15: message words are used directly. */
	RND(A,B,C,D,E,F,G,H, W0x + K[0]);
	RND(H,A,B,C,D,E,F,G, W0y + K[1]);
	RND(G,H,A,B,C,D,E,F, W0z + K[2]);
	RND(F,G,H,A,B,C,D,E, W0w + K[3]);

	RND(E,F,G,H,A,B,C,D, W1x + K[4]);
	RND(D,E,F,G,H,A,B,C, W1y + K[5]);
	RND(C,D,E,F,G,H,A,B, W1z + K[6]);
	RND(B,C,D,E,F,G,H,A, W1w + K[7]);

	RND(A,B,C,D,E,F,G,H, W2x + K[8]);
	RND(H,A,B,C,D,E,F,G, W2y + K[9]);
	RND(G,H,A,B,C,D,E,F, W2z + K[10]);
	RND(F,G,H,A,B,C,D,E, W2w + K[11]);

	RND(E,F,G,H,A,B,C,D, W3x + K[12]);
	RND(D,E,F,G,H,A,B,C, W3y + K[13]);
	RND(C,D,E,F,G,H,A,B, W3z + K[14]);
	/* Round 15: its constant is stored out of sequence at K[76]. */
	RND(B,C,D,E,F,G,H,A, W3w + K[76]);

	/*
	 * Rounds 16-31.  Before each round the slot holding W[t-16] is replaced by
	 * W[t] = W[t-16] + Wr1(W[t-2]) + W[t-7] + Wr2(W[t-15]).
	 * From here on round t uses K[t-1].
	 */
	W0x += Wr1(W3z) + W2y + Wr2(W0y);
	RND(A,B,C,D,E,F,G,H, W0x + K[15]);
	W0y += Wr1(W3w) + W2z + Wr2(W0z);
	RND(H,A,B,C,D,E,F,G, W0y + K[16]);
	W0z += Wr1(W0x) + W2w + Wr2(W0w);
	RND(G,H,A,B,C,D,E,F, W0z + K[17]);
	W0w += Wr1(W0y) + W3x + Wr2(W1x);
	RND(F,G,H,A,B,C,D,E, W0w + K[18]);

	W1x += Wr1(W0z) + W3y + Wr2(W1y);
	RND(E,F,G,H,A,B,C,D, W1x + K[19]);
	W1y += Wr1(W0w) + W3z + Wr2(W1z);
	RND(D,E,F,G,H,A,B,C, W1y + K[20]);
	W1z += Wr1(W1x) + W3w + Wr2(W1w);
	RND(C,D,E,F,G,H,A,B, W1z + K[21]);
	W1w += Wr1(W1y) + W0x + Wr2(W2x);
	RND(B,C,D,E,F,G,H,A, W1w + K[22]);

	W2x += Wr1(W1z) + W0y + Wr2(W2y);
	RND(A,B,C,D,E,F,G,H, W2x + K[23]);
	W2y += Wr1(W1w) + W0z + Wr2(W2z);
	RND(H,A,B,C,D,E,F,G, W2y + K[24]);
	W2z += Wr1(W2x) + W0w + Wr2(W2w);
	RND(G,H,A,B,C,D,E,F, W2z + K[25]);
	W2w += Wr1(W2y) + W1x + Wr2(W3x);
	RND(F,G,H,A,B,C,D,E, W2w + K[26]);

	W3x += Wr1(W2z) + W1y + Wr2(W3y);
	RND(E,F,G,H,A,B,C,D, W3x + K[27]);
	W3y += Wr1(W2w) + W1z + Wr2(W3z);
	RND(D,E,F,G,H,A,B,C, W3y + K[28]);
	W3z += Wr1(W3x) + W1w + Wr2(W3w);
	RND(C,D,E,F,G,H,A,B, W3z + K[29]);
	W3w += Wr1(W3y) + W2x + Wr2(W0x);
	RND(B,C,D,E,F,G,H,A, W3w + K[30]);

	/* Rounds 32-47: same schedule/round pattern, next 16 constants. */
	W0x += Wr1(W3z) + W2y + Wr2(W0y);
	RND(A,B,C,D,E,F,G,H, W0x + K[31]);
	W0y += Wr1(W3w) + W2z + Wr2(W0z);
	RND(H,A,B,C,D,E,F,G, W0y + K[32]);
	W0z += Wr1(W0x) + W2w + Wr2(W0w);
	RND(G,H,A,B,C,D,E,F, W0z + K[33]);
	W0w += Wr1(W0y) + W3x + Wr2(W1x);
	RND(F,G,H,A,B,C,D,E, W0w + K[34]);

	W1x += Wr1(W0z) + W3y + Wr2(W1y);
	RND(E,F,G,H,A,B,C,D, W1x + K[35]);
	W1y += Wr1(W0w) + W3z + Wr2(W1z);
	RND(D,E,F,G,H,A,B,C, W1y + K[36]);
	W1z += Wr1(W1x) + W3w + Wr2(W1w);
	RND(C,D,E,F,G,H,A,B, W1z + K[37]);
	W1w += Wr1(W1y) + W0x + Wr2(W2x);
	RND(B,C,D,E,F,G,H,A, W1w + K[38]);

	W2x += Wr1(W1z) + W0y + Wr2(W2y);
	RND(A,B,C,D,E,F,G,H, W2x + K[39]);
	W2y += Wr1(W1w) + W0z + Wr2(W2z);
	RND(H,A,B,C,D,E,F,G, W2y + K[40]);
	W2z += Wr1(W2x) + W0w + Wr2(W2w);
	RND(G,H,A,B,C,D,E,F, W2z + K[41]);
	W2w += Wr1(W2y) + W1x + Wr2(W3x);
	RND(F,G,H,A,B,C,D,E, W2w + K[42]);

	W3x += Wr1(W2z) + W1y + Wr2(W3y);
	RND(E,F,G,H,A,B,C,D, W3x + K[43]);
	W3y += Wr1(W2w) + W1z + Wr2(W3z);
	RND(D,E,F,G,H,A,B,C, W3y + K[44]);
	W3z += Wr1(W3x) + W1w + Wr2(W3w);
	RND(C,D,E,F,G,H,A,B, W3z + K[45]);
	W3w += Wr1(W3y) + W2x + Wr2(W0x);
	RND(B,C,D,E,F,G,H,A, W3w + K[46]);

	/* Rounds 48-63: same pattern, final 16 constants. */
	W0x += Wr1(W3z) + W2y + Wr2(W0y);
	RND(A,B,C,D,E,F,G,H, W0x + K[47]);
	W0y += Wr1(W3w) + W2z + Wr2(W0z);
	RND(H,A,B,C,D,E,F,G, W0y + K[48]);
	W0z += Wr1(W0x) + W2w + Wr2(W0w);
	RND(G,H,A,B,C,D,E,F, W0z + K[49]);
	W0w += Wr1(W0y) + W3x + Wr2(W1x);
	RND(F,G,H,A,B,C,D,E, W0w + K[50]);

	W1x += Wr1(W0z) + W3y + Wr2(W1y);
	RND(E,F,G,H,A,B,C,D, W1x + K[51]);
	W1y += Wr1(W0w) + W3z + Wr2(W1z);
	RND(D,E,F,G,H,A,B,C, W1y + K[52]);
	W1z += Wr1(W1x) + W3w + Wr2(W1w);
	RND(C,D,E,F,G,H,A,B, W1z + K[53]);
	W1w += Wr1(W1y) + W0x + Wr2(W2x);
	RND(B,C,D,E,F,G,H,A, W1w + K[54]);

	W2x += Wr1(W1z) + W0y + Wr2(W2y);
	RND(A,B,C,D,E,F,G,H, W2x + K[55]);
	W2y += Wr1(W1w) + W0z + Wr2(W2z);
	RND(H,A,B,C,D,E,F,G, W2y + K[56]);
	W2z += Wr1(W2x) + W0w + Wr2(W2w);
	RND(G,H,A,B,C,D,E,F, W2z + K[57]);
	W2w += Wr1(W2y) + W1x + Wr2(W3x);
	RND(F,G,H,A,B,C,D,E, W2w + K[58]);

	W3x += Wr1(W2z) + W1y + Wr2(W3y);
	RND(E,F,G,H,A,B,C,D, W3x + K[59]);
	W3y += Wr1(W2w) + W1z + Wr2(W3z);
	RND(D,E,F,G,H,A,B,C, W3y + K[60]);
	W3z += Wr1(W3x) + W1w + Wr2(W3w);
	RND(C,D,E,F,G,H,A,B, W3z + K[61]);
	W3w += Wr1(W3y) + W2x + Wr2(W0x);
	RND(B,C,D,E,F,G,H,A, W3w + K[62]);
	
    /* Feed-forward: add the working variables into the caller's state. */
    state[0] += A;
    state[1] += B;
    state[2] += C;
    state[3] += D;
    state[4] += E;
    state[5] += F;
    state[6] += G;
    state[7] += H;
}

