#include "Mem.h"


/*  Warning!
	This is not a drop-in memcpy replacement: it copies in 256-byte
	blocks only, so any trailing (count % 256) bytes are NOT copied.
*/

// rdi rsi rdx

//asm ("movl %%eax, %0\n" :"=r"(myval));

/*
 * Copy memory in 256-byte blocks using SSE loads and non-temporal stores.
 *
 * dest   - destination buffer; must be 16-byte aligned (movntps requirement)
 * source - source buffer; must be 16-byte aligned (movaps requirement)
 * count  - requested size in bytes; only (count / 256) * 256 bytes are
 *          copied, the trailing (count % 256) bytes are left untouched.
 *          A count below 256 copies nothing.
 *
 * The regions must not overlap. This is NOT a general memcpy replacement.
 */
void memcpy_sse(void *dest, const void *source, size_t count){
	char *dst = dest;
	const char *src = source;
	size_t blocks = count >> 8;	/* number of whole 256-byte blocks */

	/* The asm loop tests its counter only after the body has run, so a
	   zero block count must be filtered out here; otherwise the counter
	   would wrap around and the copy would run far past both buffers. */
	if (blocks == 0) {
		return;
	}

	__asm__ volatile(
		/* Numeric local label: stays unique even if the compiler
		   inlines or clones this function. */
		"1:\n"
			/* Hint the next block; prefetch never faults. */
			"prefetchnta 256(%[src])\n"

			/* Load one 256-byte block from source */
			"movaps (%[src]), %%xmm0\n"
			"movaps 16(%[src]), %%xmm1\n"
			"movaps 32(%[src]), %%xmm2\n"
			"movaps 48(%[src]), %%xmm3\n"
			"movaps 64(%[src]), %%xmm4\n"
			"movaps 80(%[src]), %%xmm5\n"
			"movaps 96(%[src]), %%xmm6\n"
			"movaps 112(%[src]), %%xmm7\n"
			"movaps 128(%[src]), %%xmm8\n"
			"movaps 144(%[src]), %%xmm9\n"
			"movaps 160(%[src]), %%xmm10\n"
			"movaps 176(%[src]), %%xmm11\n"
			"movaps 192(%[src]), %%xmm12\n"
			"movaps 208(%[src]), %%xmm13\n"
			"movaps 224(%[src]), %%xmm14\n"
			"movaps 240(%[src]), %%xmm15\n"

			/* Write the block to dest, bypassing the cache */
			"movntps %%xmm0, (%[dst])\n"
			"movntps %%xmm1, 16(%[dst])\n"
			"movntps %%xmm2, 32(%[dst])\n"
			"movntps %%xmm3, 48(%[dst])\n"
			"movntps %%xmm4, 64(%[dst])\n"
			"movntps %%xmm5, 80(%[dst])\n"
			"movntps %%xmm6, 96(%[dst])\n"
			"movntps %%xmm7, 112(%[dst])\n"
			"movntps %%xmm8, 128(%[dst])\n"
			"movntps %%xmm9, 144(%[dst])\n"
			"movntps %%xmm10, 160(%[dst])\n"
			"movntps %%xmm11, 176(%[dst])\n"
			"movntps %%xmm12, 192(%[dst])\n"
			"movntps %%xmm13, 208(%[dst])\n"
			"movntps %%xmm14, 224(%[dst])\n"
			"movntps %%xmm15, 240(%[dst])\n"

			"add $256, %[dst]\n"
			"add $256, %[src]\n"
			"dec %[cnt]\n"
			"jnz 1b\n"

		/* Non-temporal stores are weakly ordered; fence so they are
		   globally visible before any store that follows the copy. */
		"sfence\n"

		: [dst] "+r"(dst), [src] "+r"(src), [cnt] "+r"(blocks)
		:
		: "memory", "cc",
		  "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",
		  "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15"
	);
}
