/*-
 * Copyright 2009 Colin Percival, 2011 ArtForz, 2011 pooler, 2012 mtrlt,
 * 2012-2013 Con Kolivas.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * This file was originally written by Colin Percival as part of the Tarsnap
 * online backup system.
 */

/* N (nfactor), CPU/Memory cost parameter */



/* Byte-lane masks for the endian swaps: E0 keeps bytes 0 and 2, E1 keeps bytes 1 and 3. */
#define E0 0x00FF00FF
#define E1 0xFF00FF00

/* Left rotate, forwarded to the rotate() built-in. */
#define rotl(x,y) rotate(x,y)
/* Bitwise choose: per bit, y where x is set and z where x is clear (bitselect(z,y,x)). */
#define Ch(x,y,z) bitselect(z,y,x)
/* Bitwise majority, expressed through Ch: where x and z differ y decides, otherwise z. */
#define Maj(x,y,z) Ch(((x)^(z)),y,z)

/*
 * Two equivalent 32-bit byte-order reversals.
 * Arguments are parenthesized so that an expression argument (e.g. a | b)
 * is masked as a whole instead of being split by operator precedence.
 */
#define EndianSwapa(n) (Ch(E0, rotl((n), 8U), rotl((n), 24U)))
#define EndianSwapb(n) (rotl((n) & E0, 24U)|rotl((n) & E1, 8U))

/*
 * Mixing functions used by RND (Tr1/Tr2) and by the message schedule (Wr1/Wr2).
 * Each argument is expanded three times, so pass side-effect-free expressions.
 */
#define Tr2(x)		(rotl((x), 30U) ^ rotl((x), 19U) ^ rotl((x), 10U))
#define Tr1(x)		(rotl((x), 26U) ^ rotl((x), 21U) ^ rotl((x), 7U))
#define Wr2(x)		(rotl((x), 25U) ^ rotl((x), 14U) ^ ((x)>>3U))
#define Wr1(x)		(rotl((x), 15U) ^ rotl((x), 13U) ^ ((x)>>10U))

/*
 * One round. a..h must be modifiable lvalues; only d and h are written.
 * k is the value added for this round (callers pass message word + constant).
 * Callers rotate the argument order by one position per round instead of
 * moving values between the eight variables.
 * Kept as a plain statement sequence (not do { } while (0)) so every existing
 * use expands exactly as before; do not use it as the unbraced body of an
 * if/else or loop.
 */
#define RND(a, b, c, d, e, f, g, h, k)	\
	h += Tr1(e); 			\
	h += Ch(e, f, g); 		\
	h += (k);			\
	d += h;				\
	h += Tr2(a); 			\
	h += Maj(a, b, c);

/*
 * Gather the four consecutive uints starting at source into the uint4 at dest.
 * All four words are read into a temporary before dest is written.
 */
void uintp_to_uint4p(uint4 *dest, uint *source){
	uint4 packed;

	packed.s0 = *source;
	packed.s1 = *(source + 1);
	packed.s2 = *(source + 2);
	packed.s3 = *(source + 3);

	*dest = packed;
}

/*
 * Scatter the uint4 at source into dest[0..3].
 * The vector is copied out first, then its components are stored in order.
 */
void uint4p_to_uintp(uint *dest, uint4 *source){
	const uint4 unpacked = *source;

	dest[0] = unpacked.s0;
	dest[1] = unpacked.s1;
	dest[2] = unpacked.s2;
	dest[3] = unpacked.s3;
}

/* Store the four components of source (passed by value) into dest[0..3]. */
void uint4_to_uintp(uint *dest, uint4 source){
	uint *out = dest;

	*out++ = source.x;
	*out++ = source.y;
	*out++ = source.z;
	*out   = source.w;
}

/*
 * SHA256 - run 64 fully unrolled RND rounds over one 16-word block and fold
 * the result into the running state.
 *
 * state: eight words; read at entry and incremented in place at exit.
 * block: sixteen message words; only read.
 *
 * The message schedule lives in sixteen scalars (W0x..W3w) that are updated
 * in place as a rolling 16-word window rather than in a 64-entry array.
 * Round constants are taken from K, which is defined outside this section.
 */
void SHA256(uint*restrict state, uint*restrict block){
    /* Working variables, seeded from the current state. */
    uint A = state[0];
    uint B = state[1];
    uint C = state[2];
    uint D = state[3];
    uint E = state[4];
    uint F = state[5];
    uint G = state[6];
    uint H = state[7];
	/* Rolling message-schedule window: W0x..W3w hold block[0..15]. */
	uint W0x = block[0];
	uint W0y = block[1];
	uint W0z = block[2];
	uint W0w = block[3];
    uint W1x = block[4];
    uint W1y = block[5];
    uint W1z = block[6];
    uint W1w = block[7];
    uint W2x = block[8];
    uint W2y = block[9];
    uint W2z = block[10];
    uint W2w = block[11];
    uint W3x = block[12];
    uint W3y = block[13];
    uint W3z = block[14];
    uint W3w = block[15];

	/*
	 * Rounds 0-15: the message words are used as loaded. The argument list
	 * of RND is rotated by one position per round, which stands in for
	 * shifting values between A..H.
	 */
	RND(A,B,C,D,E,F,G,H, W0x + K[0]);
	RND(H,A,B,C,D,E,F,G, W0y + K[1]);
	RND(G,H,A,B,C,D,E,F, W0z + K[2]);
	RND(F,G,H,A,B,C,D,E, W0w + K[3]);

	RND(E,F,G,H,A,B,C,D, W1x + K[4]);
	RND(D,E,F,G,H,A,B,C, W1y + K[5]);
	RND(C,D,E,F,G,H,A,B, W1z + K[6]);
	RND(B,C,D,E,F,G,H,A, W1w + K[7]);

	RND(A,B,C,D,E,F,G,H, W2x + K[8]);
	RND(H,A,B,C,D,E,F,G, W2y + K[9]);
	RND(G,H,A,B,C,D,E,F, W2z + K[10]);
	RND(F,G,H,A,B,C,D,E, W2w + K[11]);

	RND(E,F,G,H,A,B,C,D, W3x + K[12]);
	RND(D,E,F,G,H,A,B,C, W3y + K[13]);
	RND(C,D,E,F,G,H,A,B, W3z + K[14]);
	/*
	 * NOTE(review): this round reads K[76] while its neighbours read K[14]
	 * and K[15], and the last round below reads K[62]. Presumably the K
	 * table (not visible here) stores this round's constant out of
	 * sequence; confirm against K's definition before changing any index.
	 */
	RND(B,C,D,E,F,G,H,A, W3w + K[76]);

	/*
	 * Rounds 16-31: before each round the oldest window word is replaced
	 * in place by itself + Wr1(word two back) + word seven back +
	 * Wr2(word fifteen back), then consumed by that round.
	 */
	W0x += Wr1(W3z) + W2y + Wr2(W0y);
	RND(A,B,C,D,E,F,G,H, W0x + K[15]);
	W0y += Wr1(W3w) + W2z + Wr2(W0z);
	RND(H,A,B,C,D,E,F,G, W0y + K[16]);
	W0z += Wr1(W0x) + W2w + Wr2(W0w);
	RND(G,H,A,B,C,D,E,F, W0z + K[17]);
	W0w += Wr1(W0y) + W3x + Wr2(W1x);
	RND(F,G,H,A,B,C,D,E, W0w + K[18]);

	W1x += Wr1(W0z) + W3y + Wr2(W1y);
	RND(E,F,G,H,A,B,C,D, W1x + K[19]);
	W1y += Wr1(W0w) + W3z + Wr2(W1z);
	RND(D,E,F,G,H,A,B,C, W1y + K[20]);
	W1z += Wr1(W1x) + W3w + Wr2(W1w);
	RND(C,D,E,F,G,H,A,B, W1z + K[21]);
	W1w += Wr1(W1y) + W0x + Wr2(W2x);
	RND(B,C,D,E,F,G,H,A, W1w + K[22]);

	W2x += Wr1(W1z) + W0y + Wr2(W2y);
	RND(A,B,C,D,E,F,G,H, W2x + K[23]);
	W2y += Wr1(W1w) + W0z + Wr2(W2z);
	RND(H,A,B,C,D,E,F,G, W2y + K[24]);
	W2z += Wr1(W2x) + W0w + Wr2(W2w);
	RND(G,H,A,B,C,D,E,F, W2z + K[25]);
	W2w += Wr1(W2y) + W1x + Wr2(W3x);
	RND(F,G,H,A,B,C,D,E, W2w + K[26]);

	W3x += Wr1(W2z) + W1y + Wr2(W3y);
	RND(E,F,G,H,A,B,C,D, W3x + K[27]);
	W3y += Wr1(W2w) + W1z + Wr2(W3z);
	RND(D,E,F,G,H,A,B,C, W3y + K[28]);
	W3z += Wr1(W3x) + W1w + Wr2(W3w);
	RND(C,D,E,F,G,H,A,B, W3z + K[29]);
	W3w += Wr1(W3y) + W2x + Wr2(W0x);
	RND(B,C,D,E,F,G,H,A, W3w + K[30]);

	/* Rounds 32-47: same schedule update pattern, next sixteen constants. */
	W0x += Wr1(W3z) + W2y + Wr2(W0y);
	RND(A,B,C,D,E,F,G,H, W0x + K[31]);
	W0y += Wr1(W3w) + W2z + Wr2(W0z);
	RND(H,A,B,C,D,E,F,G, W0y + K[32]);
	W0z += Wr1(W0x) + W2w + Wr2(W0w);
	RND(G,H,A,B,C,D,E,F, W0z + K[33]);
	W0w += Wr1(W0y) + W3x + Wr2(W1x);
	RND(F,G,H,A,B,C,D,E, W0w + K[34]);

	W1x += Wr1(W0z) + W3y + Wr2(W1y);
	RND(E,F,G,H,A,B,C,D, W1x + K[35]);
	W1y += Wr1(W0w) + W3z + Wr2(W1z);
	RND(D,E,F,G,H,A,B,C, W1y + K[36]);
	W1z += Wr1(W1x) + W3w + Wr2(W1w);
	RND(C,D,E,F,G,H,A,B, W1z + K[37]);
	W1w += Wr1(W1y) + W0x + Wr2(W2x);
	RND(B,C,D,E,F,G,H,A, W1w + K[38]);

	W2x += Wr1(W1z) + W0y + Wr2(W2y);
	RND(A,B,C,D,E,F,G,H, W2x + K[39]);
	W2y += Wr1(W1w) + W0z + Wr2(W2z);
	RND(H,A,B,C,D,E,F,G, W2y + K[40]);
	W2z += Wr1(W2x) + W0w + Wr2(W2w);
	RND(G,H,A,B,C,D,E,F, W2z + K[41]);
	W2w += Wr1(W2y) + W1x + Wr2(W3x);
	RND(F,G,H,A,B,C,D,E, W2w + K[42]);

	W3x += Wr1(W2z) + W1y + Wr2(W3y);
	RND(E,F,G,H,A,B,C,D, W3x + K[43]);
	W3y += Wr1(W2w) + W1z + Wr2(W3z);
	RND(D,E,F,G,H,A,B,C, W3y + K[44]);
	W3z += Wr1(W3x) + W1w + Wr2(W3w);
	RND(C,D,E,F,G,H,A,B, W3z + K[45]);
	W3w += Wr1(W3y) + W2x + Wr2(W0x);
	RND(B,C,D,E,F,G,H,A, W3w + K[46]);

	/* Rounds 48-63: same schedule update pattern, last sixteen constants. */
	W0x += Wr1(W3z) + W2y + Wr2(W0y);
	RND(A,B,C,D,E,F,G,H, W0x + K[47]);
	W0y += Wr1(W3w) + W2z + Wr2(W0z);
	RND(H,A,B,C,D,E,F,G, W0y + K[48]);
	W0z += Wr1(W0x) + W2w + Wr2(W0w);
	RND(G,H,A,B,C,D,E,F, W0z + K[49]);
	W0w += Wr1(W0y) + W3x + Wr2(W1x);
	RND(F,G,H,A,B,C,D,E, W0w + K[50]);

	W1x += Wr1(W0z) + W3y + Wr2(W1y);
	RND(E,F,G,H,A,B,C,D, W1x + K[51]);
	W1y += Wr1(W0w) + W3z + Wr2(W1z);
	RND(D,E,F,G,H,A,B,C, W1y + K[52]);
	W1z += Wr1(W1x) + W3w + Wr2(W1w);
	RND(C,D,E,F,G,H,A,B, W1z + K[53]);
	W1w += Wr1(W1y) + W0x + Wr2(W2x);
	RND(B,C,D,E,F,G,H,A, W1w + K[54]);

	W2x += Wr1(W1z) + W0y + Wr2(W2y);
	RND(A,B,C,D,E,F,G,H, W2x + K[55]);
	W2y += Wr1(W1w) + W0z + Wr2(W2z);
	RND(H,A,B,C,D,E,F,G, W2y + K[56]);
	W2z += Wr1(W2x) + W0w + Wr2(W2w);
	RND(G,H,A,B,C,D,E,F, W2z + K[57]);
	W2w += Wr1(W2y) + W1x + Wr2(W3x);
	RND(F,G,H,A,B,C,D,E, W2w + K[58]);

	W3x += Wr1(W2z) + W1y + Wr2(W3y);
	RND(E,F,G,H,A,B,C,D, W3x + K[59]);
	W3y += Wr1(W2w) + W1z + Wr2(W3z);
	RND(D,E,F,G,H,A,B,C, W3y + K[60]);
	W3z += Wr1(W3x) + W1w + Wr2(W3w);
	RND(C,D,E,F,G,H,A,B, W3z + K[61]);
	W3w += Wr1(W3y) + W2x + Wr2(W0x);
	RND(B,C,D,E,F,G,H,A, W3w + K[62]);
	
    /* Fold the working variables back into the caller's state. */
    state[0] += A;
    state[1] += B;
    state[2] += C;
    state[3] += D;
    state[4] += E;
    state[5] += F;
    state[6] += G;
    state[7] += H;
}

