/*
 * later.inc -- LATency metER
 *
 * PD 2003 by Aces Hardware Community
 * all rights unreserved
 */


{
  u32 i, j, curr, delay, inisteps, div;
  u32 *memtab, *lintab, minitab[1], *alcmem, *alclin;
  static volatile u32 dummy;
  mytime_t t1, t2, t3, t4, tcal, trun;
  mytime_t ttable[MAX_REPEAT+1];

  fprintf(stderr, "initialising...\n");

  /* Creation of chainwalk tables */
  memtab = buildtab(tabsize, blocksize, stride, &alcmem); /* randomized inblock and then block by block walk*/
  lintab = buildtab(tabsize, stride, stride, &alclin); /* line by line walk */
  minitab[0] = 0; /* access the very same word all the time (perfect L1 hitter)*/

  if(memtab == NULL || lintab == NULL)
  {
    fprintf(stderr, "Out of memory!\n");
    exit(20);
  }

  fprintf(stderr, "warming up...\n");
  /* Dumb run to reset caches and so... */

  curr = 0;
  for(i=0; i<tabsize; i++)
  {
    curr = lintab[curr];
  }
  dummy = curr+lintab[0];
  div = curr+1; /* We know but compiler doesn't, that div is 1 */
                /* that way it will have no way to optimise out */
                /* divisions by it's value */

  inisteps = steps;

  fprintf(stderr, "measuring linear walk time...\n");
  /* first we measure time consumed for just bursting in the data */
  curr = 0;

  /* start the timer just after it ticks */
  t2 = gettime();
  do
  {
    t1 = gettime();
  } while(t1 == t2);

  /* and the linear (line by line) walk */
  for(i=0; i<inisteps; i++)
  {
    curr = lintab[curr];
  }

  /* get the time */
  t2 = gettime();
  dummy += curr; /* enforce dependencies */
  fprintf(stderr, "linear walk time is %ldms\n", (long)(t2-t1));

  /* Definition of delay blocks no follows */
  /* basic delay block is just two back to back */
  /* dependant divisions by one of value of */
  /* chainwalktable index -- obviously they */
  /* dont change it's value, but they */
  /* enforce dependence on it's previous */
  /* avlue and dependence of it's next value */
  /* over results of current value; so paralelisation */
  /* is efectively disabled and that's what we need */
# define DELAY		curr/=div; curr/=div
# define DELAY2		DELAY;DELAY
# define DELAY4		DELAY2;DELAY2
# define DELAY16	DELAY4;DELAY4;DELAY4;DELAY4
# define DELAY64	DELAY16;DELAY16;DELAY16;DELAY16
# define DELAY0		dummy+=curr

  fprintf(stderr, "measuring div time...\n");
  /* Then we look for how long it takes to do the same number */
  /* (same as linear walk steps) of chained delay blocks */
  curr = 0;
  t4 = gettime();
  do
  {
    t3 = gettime();
  } while(t3 == t4);

  for(i=0; i<inisteps; i++)
  {
    DELAY;
  }

  t4 = gettime();
  dummy += curr;

  fprintf(stderr, "div time is %ldms\n", (long)(t4-t3));

  /* delay factor is increased by 1 integer part of value: */
  /* how many times delay run was shorter than linera walk */
  delay = (u32)((t2-t1)/(t4-t3))+1;
  fprintf(stderr, "delay factor is %lu\n", (unsigned long)delay);
  /* We need those idling delays to be sure that we will */
  /* not start next memory access before memory subsystem */
  /* is ready to access it immediately (without additional */
  /* delays). It is in fact only for some corner cases, and */
  /* might even not be required at all, but its generally */
  /* good idea to have ones ass covered */

  /* A this point we can do actual test */
  /* First we go for 10 to 100 calibration runs */
  /* then 10 to 100 of actual measurement runs */
  /* the condition of ending repetition of runs */
  /* (both calibration and actual measurement) */
  /* is that there are either 100 particular runs */
  /* done or there are at least 10 and 3 fastets */
  /* are all within predetermined range of each other */
  tcal = INT_MAX;
  trun = INT_MAX;

  /* Measuring code block performs choses apropraite */
  /* measurement loop according to delay factor */
  /* for both calibration and measurement) the */
  /* same loop is chosen, the only difference is */
  /* the inner code executed -- either being */
  /* calibration code (empty or just L1 cache access */
  /* -- depending on CALIBRATOR_CODE macro) */
  /* or code doing actual randomised memory access */
# define MEASURING_BLOCK(code) \
    \
    switch(delay) \
    { \
    case 0: \
      for(i=0; i<steps; i++) \
      { \
        code; \
        DELAY0; \
      } \
      break; \
    \
    case 1: \
      for(i=0; i<steps; i++) \
      { \
        code; \
        DELAY; \
      } \
      break; \
    \
    case 2: \
      for(i=0; i<steps; i++) \
      { \
        code; \
        DELAY2; \
      } \
      break; \
    \
    case 3: \
    case 4: \
      for(i=0; i<steps; i++) \
      { \
        code; \
        DELAY4; \
      } \
      break; \
    \
    case 5: \
    case 6: \
    case 7: \
    case 8: \
      for(i=0; i<steps; i++) \
      { \
        code; \
        DELAY4; \
        DELAY4; \
      } \
      break; \
    \
    case 9: \
    case 10: \
    case 11: \
    case 12: \
    case 13: \
    case 14: \
    case 15: \
    case 16: \
      for(i=0; i<steps; i++) \
      { \
        code; \
        DELAY16; \
      } \
      break; \
    \
    default: \
      if(delay <= 64) \
      { \
        for(i=0; i<steps; i++) \
        { \
          code; \
          DELAY64; \
        } \
      } \
      else \
      { \
        fprintf(stderr, "Unsuported delay factor %lu\n", (unsigned long)delay); \
        exit(10); \
      } \
    }

  /* clean partial results table*/
  memset((void*)ttable, 0, sizeof(mytime_t)*(MAX_REPEAT+1));

  /* repeat calibration runs */
  for(j=0; j<MAX_REPEAT; j++)
  {
    fprintf(stderr, "measuring delay walk time (%u)...\n", (unsigned)j);
    curr = 0;

    /* start timer after it ticks */
    t2 = gettime();
    do
    {
      t1 = gettime();
    } while(t1 == t2);

    /* do the calibration run */
    MEASURING_BLOCK(CALIBRATOR_CODE);

    /* get the run time */
    t2 = gettime();

    /* enforce dependency */
    dummy += curr;

    /* calculate the run time and store it in ranking table */
    t2 = t2-t1;
    if(t2 < tcal)
      tcal = t2;
    inserttime(ttable, t2);

    fprintf(stderr, "delay time (%u) is %ldms\n", (unsigned)j, (long)(t2));
    /* if early stop condition is met, i.e. minimum number of runs is finished */
    /* and best three results are within error margin then stop */
    if(j>=MIN_REPEAT)
    {
      if(((u64)ttable[2])*((u64)(10000/prec)) / ((u64)ttable[0]) == (u64)(10000/prec))
      {
        break;
      }
    }
  }
  /* Now we've got calibration time (i.e. tara time) */
  fprintf(stderr, "minimal delay time is %ldms\n", (long)(tcal));

  /* Clean the ranking table once more */
  memset((void*)ttable, 0, sizeof(mytime_t)*(MAX_REPEAT+1));

  /* And do all the measurement runs */
  for(j=0; j<MAX_REPEAT; j++)
  {
    /* This is very like calibration run, the only difference being */
    /* inner code snippet executed */
    fprintf(stderr, "measuring randomised walk time (%u)...\n", (unsigned)j);
    curr = 0;
    t4 = gettime();
    do
    {
      t3 = gettime();
    } while(t3 == t4);

    MEASURING_BLOCK(curr = memtab[curr]);

    t4 = gettime();
    dummy += curr;

    t4 = t4-t3;
    if(t4 < trun)
      trun = t4;
    inserttime(ttable, t4);

    fprintf(stderr, "randomised walk time (%u) is %ldms\n", (unsigned)j, (long)(t4));
    if(j>=MIN_REPEAT)
    {
      if(((u64)ttable[2])*((u64)(10000/prec)) / ((u64)ttable[0]) == (u64)(10000/prec))
      {
        break;
      }
    }
  }
  /* Now we've got actual measurement time (i.e. brutto time) */
  fprintf(stderr, "minimal randomised walk time is %ldms\n", (long)(trun));

  /* Clean up */
  free((void*)alclin);
  free((void*)alcmem);

  /* Actual time spent on accessing memory, and not on idling delays */
  /* nor control flow instructions and possibly some more additional*/
  /* code is the difference between actual measuring run time (the */
  /* best one) and calibration time (also the best one). In other */
  /* words netto = brutto - tara (and that's what we want) */
  return trun-tcal;
}
