#ifndef MEMIO
#define MEMIO 1

/*
 * Cache-line configuration.
 *
 * CLSIZE selects the line width. For each supported width, CLO names the
 * OpenCL vector type used for one line and CLM is the companion constant
 * for that width.
 *
 * NOTE(review): the unit of CLSIZE (presumably bytes, since a uint16 vector
 * is 64 bytes wide) and the exact meaning of CLM are not visible in this
 * header - confirm against the kernels that use them.
 */
#define CLSIZE 64

#if (CLSIZE == 64)
typedef uint16 CLO;
#define CLM 4
#elif (CLSIZE == 32)
typedef uint8 CLO;
#define CLM 8
#elif (CLSIZE == 16)
/* Every branch must define CLO; a bare `typedef uint4;` declares nothing. */
typedef uint4 CLO;
#define CLM 16
#else
typedef uint CLO;
#define CLM 32
#endif



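/*
 * Index formula, written in token-pasting notation (x ## SIZE, z ## SIZE).
 * Presumably the flattened layout that the CO_* offsets are derived from -
 * verify against the kernels:
 *
 *   x*(z ## SIZE) + y*(x ## SIZE)*(z ## SIZE) + z
 */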

//y*(x ## SIZE)*(z ## SIZE)

/*
 * CO_00..CO_16: offset expressions of the form (x + ystep + <term>).
 *
 * The macros take no arguments; `x`, `ystep` (and `z` for CO_00) must be in
 * scope wherever they are expanded. `ystep` presumably holds a precomputed
 * y*(x ## SIZE)*(z ## SIZE) term - verify against the callers.
 *
 * NOTE(review): CO_00 adds the variable `z`, whereas CO_01..CO_16 add the
 * literals 1..16. That asymmetry is preserved here as found; confirm it is
 * intended.
 */
#define CO_OFFSET(k) (x + ystep + (k))

#define CO_00 CO_OFFSET(z)
#define CO_01 CO_OFFSET(1)
#define CO_02 CO_OFFSET(2)
#define CO_03 CO_OFFSET(3)
#define CO_04 CO_OFFSET(4)
#define CO_05 CO_OFFSET(5)
#define CO_06 CO_OFFSET(6)
#define CO_07 CO_OFFSET(7)
#define CO_08 CO_OFFSET(8)
#define CO_09 CO_OFFSET(9)
#define CO_10 CO_OFFSET(10)
#define CO_11 CO_OFFSET(11)
#define CO_12 CO_OFFSET(12)
#define CO_13 CO_OFFSET(13)
#define CO_14 CO_OFFSET(14)
#define CO_15 CO_OFFSET(15)
#define CO_16 CO_OFFSET(16)



//16 WRITES
/*
 * 16 WRITES, uint16 lines (the _64 suffix matches the CLSIZE == 64 line type).
 *
 * WRITE16_64s<c>, for c in 0..9,a..f, gathers component .s<c> of X[0]..X[31]
 * into two uint16 vectors: line0 from X[0..15] and line1 from X[16..31].
 *
 * The macros take no arguments. At the expansion site `X` (indexed 0..31,
 * each element exposing the selected component) and the assignable variables
 * `line0` and `line1` must be in scope. Each macro expands to complete
 * statements, trailing semicolon included.
 *
 * The names start with WRITE16_ because a macro name must be an identifier
 * and an identifier cannot begin with a digit. The defines are not wrapped in
 * any conditional, so no #endif follows them.
 */
#define WRITE16_64_LINES(c) \
	line0 = (uint16)(X[0].c, X[1].c, X[2].c, X[3].c, X[4].c, X[5].c, X[6].c, X[7].c, \
	                 X[8].c, X[9].c, X[10].c, X[11].c, X[12].c, X[13].c, X[14].c, X[15].c); \
	line1 = (uint16)(X[16].c, X[17].c, X[18].c, X[19].c, X[20].c, X[21].c, X[22].c, X[23].c, \
	                 X[24].c, X[25].c, X[26].c, X[27].c, X[28].c, X[29].c, X[30].c, X[31].c);

#define WRITE16_64s0 WRITE16_64_LINES(s0)
#define WRITE16_64s1 WRITE16_64_LINES(s1)
#define WRITE16_64s2 WRITE16_64_LINES(s2)
#define WRITE16_64s3 WRITE16_64_LINES(s3)
#define WRITE16_64s4 WRITE16_64_LINES(s4)
#define WRITE16_64s5 WRITE16_64_LINES(s5)
#define WRITE16_64s6 WRITE16_64_LINES(s6)
#define WRITE16_64s7 WRITE16_64_LINES(s7)
#define WRITE16_64s8 WRITE16_64_LINES(s8)
#define WRITE16_64s9 WRITE16_64_LINES(s9)
#define WRITE16_64sa WRITE16_64_LINES(sa)
#define WRITE16_64sb WRITE16_64_LINES(sb)
#define WRITE16_64sc WRITE16_64_LINES(sc)
#define WRITE16_64sd WRITE16_64_LINES(sd)
#define WRITE16_64se WRITE16_64_LINES(se)
#define WRITE16_64sf WRITE16_64_LINES(sf)

/*
 * 16 WRITES, uint8 lines (the _32 suffix matches the CLSIZE == 32 line type).
 *
 * WRITE16_32s<c>, for c in 0..9,a..f, gathers component .s<c> of X[0]..X[31]
 * into four uint8 vectors: line0 from X[0..7], line1 from X[8..15],
 * line2 from X[16..23] and line3 from X[24..31].
 *
 * The macros take no arguments. At the expansion site `X` (indexed 0..31,
 * each element exposing the selected component) and the assignable variables
 * `line0`..`line3` must be in scope. Each macro expands to complete
 * statements, trailing semicolon included.
 *
 * The names start with WRITE16_ because a macro name must be an identifier
 * and an identifier cannot begin with a digit. The defines are not wrapped in
 * any conditional, so no #endif follows them.
 */
#define WRITE16_32_LINES(c) \
	line0 = (uint8)(X[0].c, X[1].c, X[2].c, X[3].c, X[4].c, X[5].c, X[6].c, X[7].c); \
	line1 = (uint8)(X[8].c, X[9].c, X[10].c, X[11].c, X[12].c, X[13].c, X[14].c, X[15].c); \
	line2 = (uint8)(X[16].c, X[17].c, X[18].c, X[19].c, X[20].c, X[21].c, X[22].c, X[23].c); \
	line3 = (uint8)(X[24].c, X[25].c, X[26].c, X[27].c, X[28].c, X[29].c, X[30].c, X[31].c);

#define WRITE16_32s0 WRITE16_32_LINES(s0)
#define WRITE16_32s1 WRITE16_32_LINES(s1)
#define WRITE16_32s2 WRITE16_32_LINES(s2)
#define WRITE16_32s3 WRITE16_32_LINES(s3)
#define WRITE16_32s4 WRITE16_32_LINES(s4)
#define WRITE16_32s5 WRITE16_32_LINES(s5)
#define WRITE16_32s6 WRITE16_32_LINES(s6)
#define WRITE16_32s7 WRITE16_32_LINES(s7)
#define WRITE16_32s8 WRITE16_32_LINES(s8)
#define WRITE16_32s9 WRITE16_32_LINES(s9)
#define WRITE16_32sa WRITE16_32_LINES(sa)
#define WRITE16_32sb WRITE16_32_LINES(sb)
#define WRITE16_32sc WRITE16_32_LINES(sc)
#define WRITE16_32sd WRITE16_32_LINES(sd)
#define WRITE16_32se WRITE16_32_LINES(se)
#define WRITE16_32sf WRITE16_32_LINES(sf)

/*
 * 16 WRITES, uint4 lines (the _16 suffix matches the CLSIZE == 16 line type).
 *
 * WRITE16_16s<c>, for c in 0..9,a..f, gathers component .s<c> of X[0]..X[31]
 * into eight uint4 vectors: line<k> is built from X[4k]..X[4k+3], k = 0..7.
 *
 * The macros take no arguments. At the expansion site `X` (indexed 0..31,
 * each element exposing the selected component) and the assignable variables
 * `line0`..`line7` must be in scope. Each macro expands to complete
 * statements, trailing semicolon included.
 *
 * The names start with WRITE16_ because a macro name must be an identifier
 * and an identifier cannot begin with a digit. The defines are not wrapped in
 * any conditional, so no #endif follows them.
 */
#define WRITE16_16_LINES(c) \
	line0 = (uint4)(X[0].c, X[1].c, X[2].c, X[3].c); \
	line1 = (uint4)(X[4].c, X[5].c, X[6].c, X[7].c); \
	line2 = (uint4)(X[8].c, X[9].c, X[10].c, X[11].c); \
	line3 = (uint4)(X[12].c, X[13].c, X[14].c, X[15].c); \
	line4 = (uint4)(X[16].c, X[17].c, X[18].c, X[19].c); \
	line5 = (uint4)(X[20].c, X[21].c, X[22].c, X[23].c); \
	line6 = (uint4)(X[24].c, X[25].c, X[26].c, X[27].c); \
	line7 = (uint4)(X[28].c, X[29].c, X[30].c, X[31].c);

#define WRITE16_16s0 WRITE16_16_LINES(s0)
#define WRITE16_16s1 WRITE16_16_LINES(s1)
#define WRITE16_16s2 WRITE16_16_LINES(s2)
#define WRITE16_16s3 WRITE16_16_LINES(s3)
#define WRITE16_16s4 WRITE16_16_LINES(s4)
#define WRITE16_16s5 WRITE16_16_LINES(s5)
#define WRITE16_16s6 WRITE16_16_LINES(s6)
#define WRITE16_16s7 WRITE16_16_LINES(s7)
#define WRITE16_16s8 WRITE16_16_LINES(s8)
#define WRITE16_16s9 WRITE16_16_LINES(s9)
#define WRITE16_16sa WRITE16_16_LINES(sa)
#define WRITE16_16sb WRITE16_16_LINES(sb)
#define WRITE16_16sc WRITE16_16_LINES(sc)
#define WRITE16_16sd WRITE16_16_LINES(sd)
#define WRITE16_16se WRITE16_16_LINES(se)
#define WRITE16_16sf WRITE16_16_LINES(sf)

//8 WRITES
/*
 * 8 WRITES, uint16 lines (the _64 suffix matches the CLSIZE == 64 line type).
 *
 * WRITE8_64s<c>, for c in 0..7, gathers component .s<c> of X[0]..X[31] into
 * two uint16 vectors: line0 from X[0..15] and line1 from X[16..31].
 *
 * The macros take no arguments. At the expansion site `X` (indexed 0..31,
 * each element exposing the selected component) and the assignable variables
 * `line0` and `line1` must be in scope. Each macro expands to complete
 * statements, trailing semicolon included.
 *
 * The names start with WRITE8_ because a macro name must be an identifier
 * and an identifier cannot begin with a digit. The defines are not wrapped in
 * any conditional, so no #endif follows them.
 */
#define WRITE8_64_LINES(c) \
	line0 = (uint16)(X[0].c, X[1].c, X[2].c, X[3].c, X[4].c, X[5].c, X[6].c, X[7].c, \
	                 X[8].c, X[9].c, X[10].c, X[11].c, X[12].c, X[13].c, X[14].c, X[15].c); \
	line1 = (uint16)(X[16].c, X[17].c, X[18].c, X[19].c, X[20].c, X[21].c, X[22].c, X[23].c, \
	                 X[24].c, X[25].c, X[26].c, X[27].c, X[28].c, X[29].c, X[30].c, X[31].c);

#define WRITE8_64s0 WRITE8_64_LINES(s0)
#define WRITE8_64s1 WRITE8_64_LINES(s1)
#define WRITE8_64s2 WRITE8_64_LINES(s2)
#define WRITE8_64s3 WRITE8_64_LINES(s3)
#define WRITE8_64s4 WRITE8_64_LINES(s4)
#define WRITE8_64s5 WRITE8_64_LINES(s5)
#define WRITE8_64s6 WRITE8_64_LINES(s6)
#define WRITE8_64s7 WRITE8_64_LINES(s7)

/*
 * 8 WRITES, uint8 lines (the _32 suffix matches the CLSIZE == 32 line type).
 *
 * WRITE8_32s<c>, for c in 0..7, gathers component .s<c> of X[0]..X[31] into
 * four uint8 vectors: line0 from X[0..7], line1 from X[8..15],
 * line2 from X[16..23] and line3 from X[24..31].
 *
 * The macros take no arguments. At the expansion site `X` (indexed 0..31,
 * each element exposing the selected component) and the assignable variables
 * `line0`..`line3` must be in scope. Each macro expands to complete
 * statements, trailing semicolon included.
 *
 * The names start with WRITE8_ because a macro name must be an identifier
 * and an identifier cannot begin with a digit. The defines are not wrapped in
 * any conditional, so no #endif follows them.
 */
#define WRITE8_32_LINES(c) \
	line0 = (uint8)(X[0].c, X[1].c, X[2].c, X[3].c, X[4].c, X[5].c, X[6].c, X[7].c); \
	line1 = (uint8)(X[8].c, X[9].c, X[10].c, X[11].c, X[12].c, X[13].c, X[14].c, X[15].c); \
	line2 = (uint8)(X[16].c, X[17].c, X[18].c, X[19].c, X[20].c, X[21].c, X[22].c, X[23].c); \
	line3 = (uint8)(X[24].c, X[25].c, X[26].c, X[27].c, X[28].c, X[29].c, X[30].c, X[31].c);

#define WRITE8_32s0 WRITE8_32_LINES(s0)
#define WRITE8_32s1 WRITE8_32_LINES(s1)
#define WRITE8_32s2 WRITE8_32_LINES(s2)
#define WRITE8_32s3 WRITE8_32_LINES(s3)
#define WRITE8_32s4 WRITE8_32_LINES(s4)
#define WRITE8_32s5 WRITE8_32_LINES(s5)
#define WRITE8_32s6 WRITE8_32_LINES(s6)
#define WRITE8_32s7 WRITE8_32_LINES(s7)

/*
 * 8 WRITES, uint4 lines (the _16 suffix matches the CLSIZE == 16 line type).
 *
 * WRITE8_16s<c>, for c in 0..7, gathers component .s<c> of X[0]..X[31] into
 * eight uint4 vectors: line<k> is built from X[4k]..X[4k+3], k = 0..7.
 *
 * The macros take no arguments. At the expansion site `X` (indexed 0..31,
 * each element exposing the selected component) and the assignable variables
 * `line0`..`line7` must be in scope. Each macro expands to complete
 * statements, trailing semicolon included.
 *
 * The names start with WRITE8_ because a macro name must be an identifier
 * and an identifier cannot begin with a digit. The defines are not wrapped in
 * any conditional, so no #endif follows them.
 */
#define WRITE8_16_LINES(c) \
	line0 = (uint4)(X[0].c, X[1].c, X[2].c, X[3].c); \
	line1 = (uint4)(X[4].c, X[5].c, X[6].c, X[7].c); \
	line2 = (uint4)(X[8].c, X[9].c, X[10].c, X[11].c); \
	line3 = (uint4)(X[12].c, X[13].c, X[14].c, X[15].c); \
	line4 = (uint4)(X[16].c, X[17].c, X[18].c, X[19].c); \
	line5 = (uint4)(X[20].c, X[21].c, X[22].c, X[23].c); \
	line6 = (uint4)(X[24].c, X[25].c, X[26].c, X[27].c); \
	line7 = (uint4)(X[28].c, X[29].c, X[30].c, X[31].c);

#define WRITE8_16s0 WRITE8_16_LINES(s0)
#define WRITE8_16s1 WRITE8_16_LINES(s1)
#define WRITE8_16s2 WRITE8_16_LINES(s2)
#define WRITE8_16s3 WRITE8_16_LINES(s3)
#define WRITE8_16s4 WRITE8_16_LINES(s4)
#define WRITE8_16s5 WRITE8_16_LINES(s5)
#define WRITE8_16s6 WRITE8_16_LINES(s6)
#define WRITE8_16s7 WRITE8_16_LINES(s7)

//4 WRITES
// 4WRITE_64s0: gathers component .s0 of X[0]..X[31] into line0 and line1,
// sixteen consecutive X elements per uint16.
// Expands to two assignment statements (each already ends in ';'); X, line0
// and line1 must be in scope where the macro is expanded.
// No trailing #endif: this definition is not inside a conditional of its own.
// NOTE(review): a macro name must be an identifier and cannot start with a
// digit; the name is kept unchanged here -- confirm how this header is consumed.
#define 4WRITE_64s0 \
	line0 = (uint16)(X[0].s0, X[1].s0, X[2].s0, X[3].s0, X[4].s0, X[5].s0, X[6].s0, X[7].s0, X[8].s0, X[9].s0, X[10].s0, X[11].s0, X[12].s0, X[13].s0, X[14].s0, X[15].s0); \
	line1 = (uint16)(X[16].s0, X[17].s0, X[18].s0, X[19].s0, X[20].s0, X[21].s0, X[22].s0, X[23].s0, X[24].s0, X[25].s0, X[26].s0, X[27].s0, X[28].s0, X[29].s0, X[30].s0, X[31].s0);

// 4WRITE_64s1: gathers component .s1 of X[0]..X[31] into line0 and line1,
// sixteen consecutive X elements per uint16.
// Expands to two assignment statements (each already ends in ';'); X, line0
// and line1 must be in scope where the macro is expanded.
// No trailing #endif: this definition is not inside a conditional of its own.
// NOTE(review): a macro name must be an identifier and cannot start with a
// digit; the name is kept unchanged here -- confirm how this header is consumed.
#define 4WRITE_64s1 \
	line0 = (uint16)(X[0].s1, X[1].s1, X[2].s1, X[3].s1, X[4].s1, X[5].s1, X[6].s1, X[7].s1, X[8].s1, X[9].s1, X[10].s1, X[11].s1, X[12].s1, X[13].s1, X[14].s1, X[15].s1); \
	line1 = (uint16)(X[16].s1, X[17].s1, X[18].s1, X[19].s1, X[20].s1, X[21].s1, X[22].s1, X[23].s1, X[24].s1, X[25].s1, X[26].s1, X[27].s1, X[28].s1, X[29].s1, X[30].s1, X[31].s1);

// 4WRITE_64s2: gathers component .s2 of X[0]..X[31] into line0 and line1,
// sixteen consecutive X elements per uint16.
// Expands to two assignment statements (each already ends in ';'); X, line0
// and line1 must be in scope where the macro is expanded.
// No trailing #endif: this definition is not inside a conditional of its own.
// NOTE(review): a macro name must be an identifier and cannot start with a
// digit; the name is kept unchanged here -- confirm how this header is consumed.
#define 4WRITE_64s2 \
	line0 = (uint16)(X[0].s2, X[1].s2, X[2].s2, X[3].s2, X[4].s2, X[5].s2, X[6].s2, X[7].s2, X[8].s2, X[9].s2, X[10].s2, X[11].s2, X[12].s2, X[13].s2, X[14].s2, X[15].s2); \
	line1 = (uint16)(X[16].s2, X[17].s2, X[18].s2, X[19].s2, X[20].s2, X[21].s2, X[22].s2, X[23].s2, X[24].s2, X[25].s2, X[26].s2, X[27].s2, X[28].s2, X[29].s2, X[30].s2, X[31].s2);

// 4WRITE_64s3: gathers component .s3 of X[0]..X[31] into line0 and line1,
// sixteen consecutive X elements per uint16.
// Expands to two assignment statements (each already ends in ';'); X, line0
// and line1 must be in scope where the macro is expanded.
// No trailing #endif: this definition is not inside a conditional of its own.
// NOTE(review): a macro name must be an identifier and cannot start with a
// digit; the name is kept unchanged here -- confirm how this header is consumed.
#define 4WRITE_64s3 \
	line0 = (uint16)(X[0].s3, X[1].s3, X[2].s3, X[3].s3, X[4].s3, X[5].s3, X[6].s3, X[7].s3, X[8].s3, X[9].s3, X[10].s3, X[11].s3, X[12].s3, X[13].s3, X[14].s3, X[15].s3); \
	line1 = (uint16)(X[16].s3, X[17].s3, X[18].s3, X[19].s3, X[20].s3, X[21].s3, X[22].s3, X[23].s3, X[24].s3, X[25].s3, X[26].s3, X[27].s3, X[28].s3, X[29].s3, X[30].s3, X[31].s3);

// 4WRITE_32s0: gathers component .s0 of X[0]..X[31] into line0..line3,
// eight consecutive X elements per uint8.
// Expands to four assignment statements (each already ends in ';'); X and
// line0..line3 must be in scope where the macro is expanded.
// No trailing #endif: this definition is not inside a conditional of its own.
// NOTE(review): a macro name must be an identifier and cannot start with a
// digit; the name is kept unchanged here -- confirm how this header is consumed.
#define 4WRITE_32s0 \
	line0 = (uint8)(X[0].s0, X[1].s0, X[2].s0, X[3].s0, X[4].s0, X[5].s0, X[6].s0, X[7].s0); \
	line1 = (uint8)(X[8].s0, X[9].s0, X[10].s0, X[11].s0, X[12].s0, X[13].s0, X[14].s0, X[15].s0); \
	line2 = (uint8)(X[16].s0, X[17].s0, X[18].s0, X[19].s0, X[20].s0, X[21].s0, X[22].s0, X[23].s0); \
	line3 = (uint8)(X[24].s0, X[25].s0, X[26].s0, X[27].s0, X[28].s0, X[29].s0, X[30].s0, X[31].s0);

// 4WRITE_32s1: gathers component .s1 of X[0]..X[31] into line0..line3,
// eight consecutive X elements per uint8.
// Expands to four assignment statements (each already ends in ';'); X and
// line0..line3 must be in scope where the macro is expanded.
// No trailing #endif: this definition is not inside a conditional of its own.
// NOTE(review): a macro name must be an identifier and cannot start with a
// digit; the name is kept unchanged here -- confirm how this header is consumed.
#define 4WRITE_32s1 \
	line0 = (uint8)(X[0].s1, X[1].s1, X[2].s1, X[3].s1, X[4].s1, X[5].s1, X[6].s1, X[7].s1); \
	line1 = (uint8)(X[8].s1, X[9].s1, X[10].s1, X[11].s1, X[12].s1, X[13].s1, X[14].s1, X[15].s1); \
	line2 = (uint8)(X[16].s1, X[17].s1, X[18].s1, X[19].s1, X[20].s1, X[21].s1, X[22].s1, X[23].s1); \
	line3 = (uint8)(X[24].s1, X[25].s1, X[26].s1, X[27].s1, X[28].s1, X[29].s1, X[30].s1, X[31].s1);

// 4WRITE_32s2: gathers component .s2 of X[0]..X[31] into line0..line3,
// eight consecutive X elements per uint8.
// Expands to four assignment statements (each already ends in ';'); X and
// line0..line3 must be in scope where the macro is expanded.
// No trailing #endif: this definition is not inside a conditional of its own.
// NOTE(review): a macro name must be an identifier and cannot start with a
// digit; the name is kept unchanged here -- confirm how this header is consumed.
#define 4WRITE_32s2 \
	line0 = (uint8)(X[0].s2, X[1].s2, X[2].s2, X[3].s2, X[4].s2, X[5].s2, X[6].s2, X[7].s2); \
	line1 = (uint8)(X[8].s2, X[9].s2, X[10].s2, X[11].s2, X[12].s2, X[13].s2, X[14].s2, X[15].s2); \
	line2 = (uint8)(X[16].s2, X[17].s2, X[18].s2, X[19].s2, X[20].s2, X[21].s2, X[22].s2, X[23].s2); \
	line3 = (uint8)(X[24].s2, X[25].s2, X[26].s2, X[27].s2, X[28].s2, X[29].s2, X[30].s2, X[31].s2);

// 4WRITE_32s3: gathers component .s3 of X[0]..X[31] into line0..line3,
// eight consecutive X elements per uint8.
// Expands to four assignment statements (each already ends in ';'); X and
// line0..line3 must be in scope where the macro is expanded.
// No trailing #endif: this definition is not inside a conditional of its own.
// NOTE(review): a macro name must be an identifier and cannot start with a
// digit; the name is kept unchanged here -- confirm how this header is consumed.
#define 4WRITE_32s3 \
	line0 = (uint8)(X[0].s3, X[1].s3, X[2].s3, X[3].s3, X[4].s3, X[5].s3, X[6].s3, X[7].s3); \
	line1 = (uint8)(X[8].s3, X[9].s3, X[10].s3, X[11].s3, X[12].s3, X[13].s3, X[14].s3, X[15].s3); \
	line2 = (uint8)(X[16].s3, X[17].s3, X[18].s3, X[19].s3, X[20].s3, X[21].s3, X[22].s3, X[23].s3); \
	line3 = (uint8)(X[24].s3, X[25].s3, X[26].s3, X[27].s3, X[28].s3, X[29].s3, X[30].s3, X[31].s3);

// 4WRITE_16s0: gathers component .s0 of X[0]..X[31] into line0..line7,
// four consecutive X elements per uint4.
// Expands to eight assignment statements (each already ends in ';'); X and
// line0..line7 must be in scope where the macro is expanded.
// No trailing #endif: this definition is not inside a conditional of its own.
// NOTE(review): a macro name must be an identifier and cannot start with a
// digit; the name is kept unchanged here -- confirm how this header is consumed.
#define 4WRITE_16s0 \
	line0 = (uint4)(X[0].s0, X[1].s0, X[2].s0, X[3].s0); \
	line1 = (uint4)(X[4].s0, X[5].s0, X[6].s0, X[7].s0); \
	line2 = (uint4)(X[8].s0, X[9].s0, X[10].s0, X[11].s0); \
	line3 = (uint4)(X[12].s0, X[13].s0, X[14].s0, X[15].s0); \
	line4 = (uint4)(X[16].s0, X[17].s0, X[18].s0, X[19].s0); \
	line5 = (uint4)(X[20].s0, X[21].s0, X[22].s0, X[23].s0); \
	line6 = (uint4)(X[24].s0, X[25].s0, X[26].s0, X[27].s0); \
	line7 = (uint4)(X[28].s0, X[29].s0, X[30].s0, X[31].s0);

// 4WRITE_16s1: gathers component .s1 of X[0]..X[31] into line0..line7,
// four consecutive X elements per uint4.
// Expands to eight assignment statements (each already ends in ';'); X and
// line0..line7 must be in scope where the macro is expanded.
// No trailing #endif: this definition is not inside a conditional of its own.
// NOTE(review): a macro name must be an identifier and cannot start with a
// digit; the name is kept unchanged here -- confirm how this header is consumed.
#define 4WRITE_16s1 \
	line0 = (uint4)(X[0].s1, X[1].s1, X[2].s1, X[3].s1); \
	line1 = (uint4)(X[4].s1, X[5].s1, X[6].s1, X[7].s1); \
	line2 = (uint4)(X[8].s1, X[9].s1, X[10].s1, X[11].s1); \
	line3 = (uint4)(X[12].s1, X[13].s1, X[14].s1, X[15].s1); \
	line4 = (uint4)(X[16].s1, X[17].s1, X[18].s1, X[19].s1); \
	line5 = (uint4)(X[20].s1, X[21].s1, X[22].s1, X[23].s1); \
	line6 = (uint4)(X[24].s1, X[25].s1, X[26].s1, X[27].s1); \
	line7 = (uint4)(X[28].s1, X[29].s1, X[30].s1, X[31].s1);

// 4WRITE_16s2: gathers component .s2 of X[0]..X[31] into line0..line7,
// four consecutive X elements per uint4.
// Expands to eight assignment statements (each already ends in ';'); X and
// line0..line7 must be in scope where the macro is expanded.
// No trailing #endif: this definition is not inside a conditional of its own.
// NOTE(review): a macro name must be an identifier and cannot start with a
// digit; the name is kept unchanged here -- confirm how this header is consumed.
#define 4WRITE_16s2 \
	line0 = (uint4)(X[0].s2, X[1].s2, X[2].s2, X[3].s2); \
	line1 = (uint4)(X[4].s2, X[5].s2, X[6].s2, X[7].s2); \
	line2 = (uint4)(X[8].s2, X[9].s2, X[10].s2, X[11].s2); \
	line3 = (uint4)(X[12].s2, X[13].s2, X[14].s2, X[15].s2); \
	line4 = (uint4)(X[16].s2, X[17].s2, X[18].s2, X[19].s2); \
	line5 = (uint4)(X[20].s2, X[21].s2, X[22].s2, X[23].s2); \
	line6 = (uint4)(X[24].s2, X[25].s2, X[26].s2, X[27].s2); \
	line7 = (uint4)(X[28].s2, X[29].s2, X[30].s2, X[31].s2);

// 4WRITE_16s3: gathers component .s3 of X[0]..X[31] into line0..line7,
// four consecutive X elements per uint4.
// Expands to eight assignment statements (each already ends in ';'); X and
// line0..line7 must be in scope where the macro is expanded.
// No trailing #endif: this definition is not inside a conditional of its own.
// NOTE(review): a macro name must be an identifier and cannot start with a
// digit; the name is kept unchanged here -- confirm how this header is consumed.
#define 4WRITE_16s3 \
	line0 = (uint4)(X[0].s3, X[1].s3, X[2].s3, X[3].s3); \
	line1 = (uint4)(X[4].s3, X[5].s3, X[6].s3, X[7].s3); \
	line2 = (uint4)(X[8].s3, X[9].s3, X[10].s3, X[11].s3); \
	line3 = (uint4)(X[12].s3, X[13].s3, X[14].s3, X[15].s3); \
	line4 = (uint4)(X[16].s3, X[17].s3, X[18].s3, X[19].s3); \
	line5 = (uint4)(X[20].s3, X[21].s3, X[22].s3, X[23].s3); \
	line6 = (uint4)(X[24].s3, X[25].s3, X[26].s3, X[27].s3); \
	line7 = (uint4)(X[28].s3, X[29].s3, X[30].s3, X[31].s3);

//2 WRITES
// 2WRITE_64s0: gathers component .s0 of X[0]..X[31] into line0 and line1,
// sixteen consecutive X elements per uint16.
// Expands to two assignment statements (each already ends in ';'); X, line0
// and line1 must be in scope where the macro is expanded.
// No trailing #endif: this definition is not inside a conditional of its own.
// NOTE(review): a macro name must be an identifier and cannot start with a
// digit; the name is kept unchanged here -- confirm how this header is consumed.
#define 2WRITE_64s0 \
	line0 = (uint16)(X[0].s0, X[1].s0, X[2].s0, X[3].s0, X[4].s0, X[5].s0, X[6].s0, X[7].s0, X[8].s0, X[9].s0, X[10].s0, X[11].s0, X[12].s0, X[13].s0, X[14].s0, X[15].s0); \
	line1 = (uint16)(X[16].s0, X[17].s0, X[18].s0, X[19].s0, X[20].s0, X[21].s0, X[22].s0, X[23].s0, X[24].s0, X[25].s0, X[26].s0, X[27].s0, X[28].s0, X[29].s0, X[30].s0, X[31].s0);

// 2WRITE_64s1: gathers component .s1 of X[0]..X[31] into line0 and line1,
// sixteen consecutive X elements per uint16.
// Expands to two assignment statements (each already ends in ';'); X, line0
// and line1 must be in scope where the macro is expanded.
// No trailing #endif: this definition is not inside a conditional of its own.
// NOTE(review): a macro name must be an identifier and cannot start with a
// digit; the name is kept unchanged here -- confirm how this header is consumed.
#define 2WRITE_64s1 \
	line0 = (uint16)(X[0].s1, X[1].s1, X[2].s1, X[3].s1, X[4].s1, X[5].s1, X[6].s1, X[7].s1, X[8].s1, X[9].s1, X[10].s1, X[11].s1, X[12].s1, X[13].s1, X[14].s1, X[15].s1); \
	line1 = (uint16)(X[16].s1, X[17].s1, X[18].s1, X[19].s1, X[20].s1, X[21].s1, X[22].s1, X[23].s1, X[24].s1, X[25].s1, X[26].s1, X[27].s1, X[28].s1, X[29].s1, X[30].s1, X[31].s1);

// 2WRITE_32s0: gathers component .s0 of X[0]..X[31] into line0..line3,
// eight consecutive X elements per uint8.
// Expands to four assignment statements (each already ends in ';'); X and
// line0..line3 must be in scope where the macro is expanded.
// No trailing #endif: this definition is not inside a conditional of its own.
// NOTE(review): a macro name must be an identifier and cannot start with a
// digit; the name is kept unchanged here -- confirm how this header is consumed.
#define 2WRITE_32s0 \
	line0 = (uint8)(X[0].s0, X[1].s0, X[2].s0, X[3].s0, X[4].s0, X[5].s0, X[6].s0, X[7].s0); \
	line1 = (uint8)(X[8].s0, X[9].s0, X[10].s0, X[11].s0, X[12].s0, X[13].s0, X[14].s0, X[15].s0); \
	line2 = (uint8)(X[16].s0, X[17].s0, X[18].s0, X[19].s0, X[20].s0, X[21].s0, X[22].s0, X[23].s0); \
	line3 = (uint8)(X[24].s0, X[25].s0, X[26].s0, X[27].s0, X[28].s0, X[29].s0, X[30].s0, X[31].s0);

// 2WRITE_32s1: gathers component .s1 of X[0]..X[31] into line0..line3,
// eight consecutive X elements per uint8.
// Expands to four assignment statements (each already ends in ';'); X and
// line0..line3 must be in scope where the macro is expanded.
// No trailing #endif: this definition is not inside a conditional of its own.
// NOTE(review): a macro name must be an identifier and cannot start with a
// digit; the name is kept unchanged here -- confirm how this header is consumed.
#define 2WRITE_32s1 \
	line0 = (uint8)(X[0].s1, X[1].s1, X[2].s1, X[3].s1, X[4].s1, X[5].s1, X[6].s1, X[7].s1); \
	line1 = (uint8)(X[8].s1, X[9].s1, X[10].s1, X[11].s1, X[12].s1, X[13].s1, X[14].s1, X[15].s1); \
	line2 = (uint8)(X[16].s1, X[17].s1, X[18].s1, X[19].s1, X[20].s1, X[21].s1, X[22].s1, X[23].s1); \
	line3 = (uint8)(X[24].s1, X[25].s1, X[26].s1, X[27].s1, X[28].s1, X[29].s1, X[30].s1, X[31].s1);

// 2WRITE_16s0: gathers component .s0 of X[0]..X[31] into line0..line7,
// four consecutive X elements per uint4.
// Expands to eight assignment statements (each already ends in ';'); X and
// line0..line7 must be in scope where the macro is expanded.
// No trailing #endif: this definition is not inside a conditional of its own.
// NOTE(review): a macro name must be an identifier and cannot start with a
// digit; the name is kept unchanged here -- confirm how this header is consumed.
#define 2WRITE_16s0 \
	line0 = (uint4)(X[0].s0, X[1].s0, X[2].s0, X[3].s0); \
	line1 = (uint4)(X[4].s0, X[5].s0, X[6].s0, X[7].s0); \
	line2 = (uint4)(X[8].s0, X[9].s0, X[10].s0, X[11].s0); \
	line3 = (uint4)(X[12].s0, X[13].s0, X[14].s0, X[15].s0); \
	line4 = (uint4)(X[16].s0, X[17].s0, X[18].s0, X[19].s0); \
	line5 = (uint4)(X[20].s0, X[21].s0, X[22].s0, X[23].s0); \
	line6 = (uint4)(X[24].s0, X[25].s0, X[26].s0, X[27].s0); \
	line7 = (uint4)(X[28].s0, X[29].s0, X[30].s0, X[31].s0);

// 2WRITE_16s1: gathers component .s1 of X[0]..X[31] into line0..line7,
// four consecutive X elements per uint4.
// Expands to eight assignment statements (each already ends in ';'); X and
// line0..line7 must be in scope where the macro is expanded.
// No trailing #endif: this definition is not inside a conditional of its own.
// NOTE(review): a macro name must be an identifier and cannot start with a
// digit; the name is kept unchanged here -- confirm how this header is consumed.
#define 2WRITE_16s1 \
	line0 = (uint4)(X[0].s1, X[1].s1, X[2].s1, X[3].s1); \
	line1 = (uint4)(X[4].s1, X[5].s1, X[6].s1, X[7].s1); \
	line2 = (uint4)(X[8].s1, X[9].s1, X[10].s1, X[11].s1); \
	line3 = (uint4)(X[12].s1, X[13].s1, X[14].s1, X[15].s1); \
	line4 = (uint4)(X[16].s1, X[17].s1, X[18].s1, X[19].s1); \
	line5 = (uint4)(X[20].s1, X[21].s1, X[22].s1, X[23].s1); \
	line6 = (uint4)(X[24].s1, X[25].s1, X[26].s1, X[27].s1); \
	line7 = (uint4)(X[28].s1, X[29].s1, X[30].s1, X[31].s1);

//1 WRITES
// 1WRITE_64s2: gathers component .s2 of X[0]..X[31] into line0 and line1,
// sixteen consecutive X elements per uint16.
// Expands to two assignment statements (each already ends in ';'); X, line0
// and line1 must be in scope where the macro is expanded.
// No trailing #endif: this definition is not inside a conditional of its own.
// NOTE(review): a macro name must be an identifier and cannot start with a
// digit; the name is kept unchanged here -- confirm how this header is consumed.
// NOTE(review): the 2WRITE/4WRITE families in this header start at component
// s0, while this 1WRITE variant reads s2 -- confirm that is intended.
#define 1WRITE_64s2 \
	line0 = (uint16)(X[0].s2, X[1].s2, X[2].s2, X[3].s2, X[4].s2, X[5].s2, X[6].s2, X[7].s2, X[8].s2, X[9].s2, X[10].s2, X[11].s2, X[12].s2, X[13].s2, X[14].s2, X[15].s2); \
	line1 = (uint16)(X[16].s2, X[17].s2, X[18].s2, X[19].s2, X[20].s2, X[21].s2, X[22].s2, X[23].s2, X[24].s2, X[25].s2, X[26].s2, X[27].s2, X[28].s2, X[29].s2, X[30].s2, X[31].s2);

// 1WRITE_32s2: gathers component .s2 of X[0]..X[31] into line0..line3,
// eight consecutive X elements per uint8.
// Expands to four assignment statements (each already ends in ';'); X and
// line0..line3 must be in scope where the macro is expanded.
// No trailing #endif: this definition is not inside a conditional of its own.
// NOTE(review): a macro name must be an identifier and cannot start with a
// digit; the name is kept unchanged here -- confirm how this header is consumed.
// NOTE(review): the 2WRITE/4WRITE families in this header start at component
// s0, while this 1WRITE variant reads s2 -- confirm that is intended.
#define 1WRITE_32s2 \
	line0 = (uint8)(X[0].s2, X[1].s2, X[2].s2, X[3].s2, X[4].s2, X[5].s2, X[6].s2, X[7].s2); \
	line1 = (uint8)(X[8].s2, X[9].s2, X[10].s2, X[11].s2, X[12].s2, X[13].s2, X[14].s2, X[15].s2); \
	line2 = (uint8)(X[16].s2, X[17].s2, X[18].s2, X[19].s2, X[20].s2, X[21].s2, X[22].s2, X[23].s2); \
	line3 = (uint8)(X[24].s2, X[25].s2, X[26].s2, X[27].s2, X[28].s2, X[29].s2, X[30].s2, X[31].s2);

// 1WRITE_16s2: gathers component .s2 of X[0]..X[31] into line0..line7,
// four consecutive X elements per uint4.
// Expands to eight assignment statements (each already ends in ';'); X and
// line0..line7 must be in scope where the macro is expanded.
// No trailing #endif: this definition is not inside a conditional of its own.
// NOTE(review): a macro name must be an identifier and cannot start with a
// digit; the name is kept unchanged here -- confirm how this header is consumed.
// NOTE(review): the 2WRITE/4WRITE families in this header start at component
// s0, while this 1WRITE variant reads s2 -- confirm that is intended.
#define 1WRITE_16s2 \
	line0 = (uint4)(X[0].s2, X[1].s2, X[2].s2, X[3].s2); \
	line1 = (uint4)(X[4].s2, X[5].s2, X[6].s2, X[7].s2); \
	line2 = (uint4)(X[8].s2, X[9].s2, X[10].s2, X[11].s2); \
	line3 = (uint4)(X[12].s2, X[13].s2, X[14].s2, X[15].s2); \
	line4 = (uint4)(X[16].s2, X[17].s2, X[18].s2, X[19].s2); \
	line5 = (uint4)(X[20].s2, X[21].s2, X[22].s2, X[23].s2); \
	line6 = (uint4)(X[24].s2, X[25].s2, X[26].s2, X[27].s2); \
	line7 = (uint4)(X[28].s2, X[29].s2, X[30].s2, X[31].s2);

#endif