#include <sys/mman.h>

void *mmap(void *addr, size_t len, int prot, int flags,
       int fildes, off_t off);

#include <stdlib.h>

int posix_memalign(void **memptr, size_t alignment, size_t size);


#ifdef _PPC_

{ put a really optimized PPC version of the routine here

#else

{ plain vanilla version }

#endif

should be written both to guard it and simple so that anybody reading the code can see for-what this is intended


// DP_sum computes the sum of an array of floats.  As written, this routine is 
// serially-dependent and generally very slow unless a compiler can optimize it 
// to the later forms....
//
// This is a trivial routine and contributed to the public domain
static
INT32BITS DP_sum( FLOAT64BITS      x[], 
                  FLOAT64BITS      unused[],
                  FLOAT64BITS      out[], 
         unsigned POINTERSIZEINT   N,
                  INT32BITS        unused_ ) {

unsigned POINTERSIZEINT i;
FLOAT64BITS sum =0.0;

for (i = 0; i < N; i++) { 
   sum += x[i];   
   }
out[0] = sum;
return 0;
}


// DP_sumU16 introduces 16 intermediate summing registers so that
// the additions can be unrolled and deserialized.  However it 
// doesn't explicitly deal with load latencies in any way. 
//
// This routine is contributed to the public domain                                
static
INT32BITS DP_sumU16( register FLOAT64BITS     * x, 
                     register FLOAT64BITS     * unused,
                     register FLOAT64BITS     * out, 
        register     unsigned POINTERSIZEINT  N,
                     unsigned INT32BITS       unused2) {

register POINTERSIZEINT i;

FLOAT64BITS sum0 , sum1, sum2, sum3, sum4, sum5, sum6, sum7,
            sum8 , sum9, sumA, sumB, sumC, sumD, sumE, sumF;

i  = N >> 4; // this is the work-loop count
N -= i << 4; // remainder

sum0 = 0;
if (i) { i--;
         sum0 = *x++;  sum1 = *x++;  sum2 = *x++;  sum3 = *x++;
         sum4 = *x++;  sum5 = *x++;  sum6 = *x++;  sum7 = *x++;
         sum8 = *x++;  sum9 = *x++;  sumA = *x++;  sumB = *x++;
         sumC = *x++;  sumD = *x++;  sumE = *x++;  sumF = *x++;
         while (i) { i--;
            sum0 += *x++;  sum1 += *x++;  sum2 += *x++;  sum3 += *x++;
            sum4 += *x++;  sum5 += *x++;  sum6 += *x++;  sum7 += *x++;
            sum8 += *x++;  sum9 += *x++;  sumA += *x++;  sumB += *x++;
            sumC += *x++;  sumD += *x++;  sumE += *x++;  sumF += *x++;
            }
            
         sum0 += sum1; sum2 += sum3; sum4 += sum5; sum6 += sum7;
         sum8 += sum9; sumA += sumB; sumC += sumD; sumE += sumF;
         
         sum0 += sum2; sum4 += sum6; sum8 += sumA, sumC += sumE;
         
         sum0 += sum4;  sum8 += sumC ;
         
         sum0 += sum8;
      }
      
while (N) { N--;  sum0 += *x++; }   // clean up any tail
*out = sum0;
return N;  // return zero
}
                                
// DP_sumU16_i does loop inversion in addition  to unrolling, so
// that loads can be hoisted by 16 adds... and note that in the
// case of Power4/970/IA_64 this is still only 8 cycles.   
//
// This routine also uses address+offset pointer motion; this 
// will not disadvantage machines without this as an intrinsic 
// compared to simple ++ 
//
// This routine is contributed to the public domain                                
                                
static
INT32BITS DP_sumU16_i( register FLOAT64BITS     * x, 
                       register FLOAT64BITS     * unused,
                       register FLOAT64BITS     * out, 
         register      unsigned POINTERSIZEINT  N,
                       unsigned INT32BITS       unused2) {

register POINTERSIZEINT i;

// register 32 floats...
 register FLOAT64BITS L0 , L1, L2, L3, L4, L5, L6, L7,
                      L8 , L9, LA, LB, LC, LD, LE, LF,
            
                      sum0 , sum1, sum2, sum3, sum4, sum5, sum6, sum7,
                      sum8 , sum9, sumA, sumB, sumC, sumD, sumE, sumF;

  register int Poff1 =     sizeof(FLOAT64BITS);
  register int Poff2 =  2* sizeof(FLOAT64BITS);
  register int Poff3 =  3* sizeof(FLOAT64BITS);
  register int Poff4 =  4* sizeof(FLOAT64BITS);
  register int Poff5 =  5* sizeof(FLOAT64BITS);
  register int Poff6 =  6* sizeof(FLOAT64BITS);
  register int Poff7 =  7* sizeof(FLOAT64BITS);
  register int Poff8 =  8* sizeof(FLOAT64BITS);
  register int Poff9 =  9* sizeof(FLOAT64BITS);
  register int PoffA = 10* sizeof(FLOAT64BITS);
  register int PoffB = 11* sizeof(FLOAT64BITS);
  register int PoffC = 12* sizeof(FLOAT64BITS);
  register int PoffD = 13* sizeof(FLOAT64BITS);
  register int PoffE = 14* sizeof(FLOAT64BITS);
  register int PoffF = 15* sizeof(FLOAT64BITS);
 

i  = N >> 4; // this is the work-loop count
N -= i << 4; // remainder

sum0 = 0;
 
if (i) { i--;
         L0 = *x;                                        
         L1 = *( (FLOAT64BITS *) ( (char*) x + Poff1));   sum1 = 0;
         L2 = *( (FLOAT64BITS *) ( (char*) x + Poff2));   sum2 = 0;
         L3 = *( (FLOAT64BITS *) ( (char*) x + Poff3));   sum3 = 0;
         L4 = *( (FLOAT64BITS *) ( (char*) x + Poff4));   sum4 = 0;
         L5 = *( (FLOAT64BITS *) ( (char*) x + Poff5));   sum5 = 0;
         L6 = *( (FLOAT64BITS *) ( (char*) x + Poff6));   sum6 = 0;
         L7 = *( (FLOAT64BITS *) ( (char*) x + Poff7));   sum7 = 0;
         L8 = *( (FLOAT64BITS *) ( (char*) x + Poff8));   sum8 = 0;
         L9 = *( (FLOAT64BITS *) ( (char*) x + Poff9));   sum9 = 0;
         LA = *( (FLOAT64BITS *) ( (char*) x + PoffA));   sumA = 0;
         LB = *( (FLOAT64BITS *) ( (char*) x + PoffB));   sumB = 0;
         LC = *( (FLOAT64BITS *) ( (char*) x + PoffC));   sumC = 0;
         LD = *( (FLOAT64BITS *) ( (char*) x + PoffD));   sumD = 0;
         LE = *( (FLOAT64BITS *) ( (char*) x + PoffE));   sumE = 0;
         LF = *( (FLOAT64BITS *) ( (char*) x + PoffF));   sumF = 0;   x+= 16;
     

         while (i) { i--;
             sum0 += L0; L0 = *x;
             sum1 += L1; L1 = *( (FLOAT64BITS *) ( (char*) x + Poff1));
             sum2 += L2; L2 = *( (FLOAT64BITS *) ( (char*) x + Poff2));
             sum3 += L3; L3 = *( (FLOAT64BITS *) ( (char*) x + Poff3));
             sum4 += L4; L4 = *( (FLOAT64BITS *) ( (char*) x + Poff4));
             sum5 += L5; L5 = *( (FLOAT64BITS *) ( (char*) x + Poff5));
             sum6 += L6; L6 = *( (FLOAT64BITS *) ( (char*) x + Poff6));
             sum7 += L7; L7 = *( (FLOAT64BITS *) ( (char*) x + Poff7));
             sum8 += L8; L8 = *( (FLOAT64BITS *) ( (char*) x + Poff8));
             sum9 += L9; L9 = *( (FLOAT64BITS *) ( (char*) x + Poff9));
             sumA += LA; LA = *( (FLOAT64BITS *) ( (char*) x + PoffA));
             sumB += LB; LB = *( (FLOAT64BITS *) ( (char*) x + PoffB));
             sumC += LC; LC = *( (FLOAT64BITS *) ( (char*) x + PoffC));
             sumD += LD; LD = *( (FLOAT64BITS *) ( (char*) x + PoffD));
             sumE += LE; LE = *( (FLOAT64BITS *) ( (char*) x + PoffE));
             sumF += LF; LF = *( (FLOAT64BITS *) ( (char*) x + PoffF)); x+= 16;
            }
            
         sum0 += L0; sum1 += L1; sum2 += L2; sum3 += L3;      
         sum4 += L4; sum5 += L5; sum6 += L6; sum7 += L7;     
         sum8 += L8; sum9 += L9; sumA += LA; sumB += LB;
         sumC += LC; sumD += LD; sumE += LE; sumF += LF;  
            
         sum0 += sum1; sum2 += sum3; sum4 += sum5; sum6 += sum7;
         sum8 += sum9; sumA += sumB; sumC += sumD; sumE += sumF;
         
         sum0 += sum2; sum4 += sum6; sum8 += sumA, sumC += sumE;
         
         sum0 += sum4;  sum8 += sumC ;
         
         sum0 += sum8;
      }
      
while (N) { N--;  sum0 += *x++; }   // clean up any tail
*out = sum0;
return N;  // return zero
}


