#include <stdio.h>
#include <stdlib.h>
#include <bitset>
//#include <mmintrin.h>
#include <xmmintrin.h>


//#include <iostream>
//#include <sstream>
//#include <string>


//unsigned char c = 3;  // 011
//bitset<3> bits;

//typedef std::bitset <1> bits_t;

//std::bitset<2*N> b = (std::bitset<2*N>(high) << N) | std::bitset<2*N>(low);

//std::bitset <1> a = 1;


// Note: set() can take two parameters -- the second being the boolean value
//bits.set( 0, c & ( 1 << 2 ) );
//bits.set( 1, c & ( 1 << 1 ) );
//bits.set( 2, c & ( 1 << 0 ) );



// Record holding exactly one flag, stored as a one-element std::bitset.
// (Earlier drafts tried an `unsigned c : 1` bit-field for the same member.)
struct one_bit
{
    std::bitset<1> c;   // the flag; index 0 is the only valid position
};

// Earlier initializer experiment, kept for reference:
//struct one_bit test[] = {{0x1}, {0x0}, {0x1}, {0x0}};

// Short alias used when declaring one_bit objects.
typedef one_bit boolean;


// Table of 65536 individually addressable bits (65536 bits = 8192 bytes of
// payload).  Earlier drafts also carried len / table_members / empty /
// search_num counters and tried other containers (a 256-bit bitset, a
// valarray<bool>); only the bitset remains.
struct one_index
{
    std::bitset<65536> c;
};

// Single file-scope instance of the table.
one_index pat_index;


//struct one_index *pat_index;

// File-scope array of four one_bit flags (boolean is the typedef for
// struct one_bit).  Nothing in the visible part of this file reads or
// writes it.
boolean mybool[4];




// Computes pResult[i] = sqrt(pArray1[i]^2 + pArray2[i]^2) + 0.5 for every
// i in [0, nSize).
//
// Four elements are processed per iteration with packed SSE instructions;
// the remaining nSize % 4 elements are handled one at a time with the scalar
// SSE forms so that no element is skipped.  Unaligned loads and stores are
// used, so the arrays do not have to be 16-byte aligned.  A non-positive
// nSize leaves pResult untouched.
void ComputeArrayCPlusPlusSSE(
          float* pArray1,                   // [in] first source array

          float* pArray2,                   // [in] second source array

          float* pResult,                   // [out] result array

          int nSize)                        // [in] size of all arrays

{
    const __m128 m0_5 = _mm_set_ps1(0.5f);  // m0_5[0, 1, 2, 3] = 0.5

    int i = 0;

    // Packed body.  The bound is written as "nSize - i >= 4" rather than
    // "i + 4 <= nSize" so the check cannot overflow for nSize near INT_MAX.
    for ( ; nSize - i >= 4; i += 4 )
    {
        const __m128 src1 = _mm_loadu_ps(pArray1 + i);
        const __m128 src2 = _mm_loadu_ps(pArray2 + i);

        const __m128 sq1 = _mm_mul_ps(src1, src1);     // src1 * src1
        const __m128 sq2 = _mm_mul_ps(src2, src2);     // src2 * src2
        const __m128 root = _mm_sqrt_ps(_mm_add_ps(sq1, sq2));

        _mm_storeu_ps(pResult + i, _mm_add_ps(root, m0_5));
    }

    // Scalar tail: same formula on the low lane only, for the last
    // nSize % 4 elements.
    for ( ; i < nSize; i++ )
    {
        const __m128 src1 = _mm_load_ss(pArray1 + i);
        const __m128 src2 = _mm_load_ss(pArray2 + i);

        const __m128 sq1 = _mm_mul_ss(src1, src1);
        const __m128 sq2 = _mm_mul_ss(src2, src2);
        const __m128 root = _mm_sqrt_ss(_mm_add_ss(sq1, sq2));

        _mm_store_ss(pResult + i, _mm_add_ss(root, m0_5));
    }
}




// Compares two floats for equality using the SSE scalar compare.
//
// Returns 1.0f when arg1 == arg2 and 0.0f otherwise (a NaN operand compares
// unequal, so it yields 0.0f).
//
// The SSE compare produces an all-ones / all-zeros bit mask rather than a
// number; read directly as a float the all-ones mask is a NaN, which cannot
// be tested with ==.  The mask is therefore ANDed with the bit pattern of
// 1.0f to turn it into an ordinary 1.0f / 0.0f value.
float compare_series(const float arg1, const float arg2) {
   const __m128 mask = _mm_cmpeq_ss(_mm_set_ss(arg1), _mm_set_ss(arg2));
   const __m128 as_number = _mm_and_ps(mask, _mm_set_ss(1.0f));

   return _mm_cvtss_f32(as_number);
}






// Returns an approximation of 1 / sqrt(arg) computed with the SSE scalar
// reciprocal-square-root instruction.
//
// This is a fast estimate, not a correctly rounded result; Intel documents
// the maximum relative error as 1.5 * 2^-12.  Previously the function ignored
// arg and returned the constant 4.0f (a leftover test stub).
float rsqrt(const float arg) {
   float dummy;
   _mm_store_ss(&dummy, _mm_rsqrt_ss(_mm_set_ss(arg)));
   return dummy;
}




// Scratch driver: fills three sample arrays, prints the value returned by two
// compare_series() calls, runs a small inline-assembly register copy, and
// prints the sizes of the bit-holding structs.
int main(){

    //__m128 m1, m2, m3, m4;
    //__m128 a=(float)4;
    //__m128 b=(float)5;

    int count;

    float a[25];
    float b[25];
    float c[25] = {0};      // zero-filled so the += below reads a defined value

    for(count=0; count<25; count++){
        a[count]=2;
        b[count]=2;
        c[count]+=b[count];
    }

    //float foo = 2;
    float result;
    //result = rsqrt(foo);

    float one = 5;
    float two = 5;

    // Keep the return value: it was previously discarded, so the printf
    // below read an uninitialised variable.
    result = compare_series(one, two);

    one = 6;
    printf("Result: %lf\n", result);

    result = compare_series((float)one, (float)two);
    printf("Result: %lf\n", result);

    /*
    __m128 bar1 = _mm_set1_ps(foo);
    __m128 bar2 = _mm_load1_ps(&foo);
    __m128 bar3 = _mm_set_ps(foo, foo, foo, foo);
    __m128 bar4 = _mm_shufps_ps(_mm_load_ss(&foo), _mm_load_ss(&foo), 0);
    */

    //ComputeArrayCPlusPlusSSE(a, b, c, 25);

    //printf("count: %d\n", count);

    /*
        for(a=0; a<256; a++){
            for(b=0; b<256; b++){
                pat_index.c[(a*256)+b]=1;
                //pat_index.d[a]=1;
            }
        }

        for(a=0; a<256; a++){
            for(b=0; b<256; b++){
                pat_index.c[(a*256)+b]=0;
                //pat_index.d[a]=1;
            }
        }
    */

    //__m128 e =  _mm_cmpeq_ss((float)a , (float)b );

    //printf("e: %f\n", (float)e);

    // GNU extended asm (x86 only): copies `no` into `val` by way of %ebx.
    // `val` is not read afterwards.
    int no = 100, val ;
        asm ("movl %1, %%ebx;"
             "movl %%ebx, %0;"
             : "=r" ( val )        /* output */
             : "r" ( no )         /* input */
             : "%ebx"         /* clobbered register */
         );

    // sizeof yields size_t; cast to match the %lu conversion on every
    // platform, including those where size_t is not unsigned long.
    printf("size:\n");
    printf("\tone_bit: %lu\n", static_cast<unsigned long>(sizeof(struct one_bit)));
    printf("\tone_index: %lu\n", static_cast<unsigned long>(sizeof(struct one_index)));
    printf("\tchar: %lu\n", static_cast<unsigned long>(sizeof(char)));
    //printf("\ta: %lu\n", sizeof(a));

    return 0;
}
