/*
 * later.c -- LATency metER
 * ver. 0.6
 *
 * Put into Public Domain in 2003 by Aces Hardware Community
 *
 * Contributors:
 * Bamsen
 * Eric Bron
 * Just Brew It
 * Sebastian
 * NoSpammer
 *
 * all rights unreserved
 */

#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <limits.h>

/*
 * Some OS abstraction
 */
#ifdef _WIN32
#include <windows.h>
unsigned long gettime()
{
  return GetTickCount();
}

int nice(int n)
{
  HANDLE thread;

  if(n<0)
  {
    thread = GetCurrentThread();
    SetThreadPriority(thread, THREAD_PRIORITY_TIME_CRITICAL);
  }
  return 0;
}

size_t getpagesize()
{
  SYSTEM_INFO info;

  GetSystemInfo(&info);
  return (size_t)info.dwPageSize;
}
#else
#include <sys/time.h>
#include <unistd.h>
unsigned long gettime()
{
  struct timeval tv;
  gettimeofday(&tv, NULL);
  unsigned long long temp = tv.tv_sec * 1000 + (tv.tv_usec / 1000);
  return (unsigned long) (temp & 0xffffffff);
}
#endif

/*
 * Some machine/compiler abstraction
 */
typedef unsigned long mytime_t;
typedef unsigned int u32;
typedef int i32;
#ifdef _MSC_VER
typedef unsigned __int64 u64;
typedef __int64 i64;
#else
typedef unsigned long long int u64;
typedef long long int i64;
#endif


/*
 * (Pseudo)random number stuff, it has nice property of walking over
 * all the numbers (and without repetitions) within the range
 */
typedef struct randstate_
{
  u64 x, a, c, m;
} RANDSTATE;


/*
 *  lim must be 2^expo, where expo is any number from 0 to 32
 */
RANDSTATE randinit(u32 seed, u64 lim)
{
  RANDSTATE r;

  r.x = seed;
  r.m = lim;

  if(lim>=128)
  {
    r.a = ((r.m >> 2) + (r.m >> 1) + (r.m >> 5) + 25);
    r.c = ((r.m >> 2) + (r.m >> 3) + (r.m >> 6) + 9);
  }
  else
  {
    r.a = 9 % lim;
    r.c = 5 % lim;
  }
  return r;
}


u32 rnd(RANDSTATE *r)
{
  return (u32)(r->x = (r->a * r->x + r->c) % r->m);
}


/*
 *  Reverse bits in a number smaller than 2*lim
 */
u32 revbits(u32 val, u32 lim)
{
  u32 i, v;

  v = 0;
  for(i=1; i<lim; i<<=1)
  {
    v <<= 1;
    v |= val&1;
    val >>= 1;
  }
  return v;
}


/*
 *  Build chain-walk table. Do it in such way that
 *  all the entries are visited in pseudorandom
 *  way, but accesses within the same cache line
 *  (which's lenght is designated by stride) are
 *  set as long apart as they can, effectively
 *  disabling the possibility, that the same cache
 *  line would be still present when next access
 *  of it occurs (given that table size is more
 *  than 2 times greater than cache).
 */
u32* buildtab(u32 size, u32 subblock, u32 stride, u32 **alc)
{
  u32 lim, n, step, i, j, k, curr, next, base, modul, nextbase;
  size_t psize;
  u32 *tab;
  RANDSTATE r;

  n = size/subblock;
  lim = subblock/stride;
  step = stride/sizeof(u32);
  modul = size/sizeof(u32);

  r = randinit(0, lim);

  psize = getpagesize();
  tab = (u32*)malloc(size+psize);
  if(alc != NULL)
    *alc = tab;
  if(tab == NULL)
    return NULL;
  if(sizeof(u32*) <= sizeof(u32))
    tab = (u32*)((((u32)tab)+psize)&(~((u32)(psize-1))));
  else
    tab = (u32*)((((u64)tab)+psize)&(~((u64)(psize-1))));
  /* tab is now page aligned */

  memset((void*)tab, 213, size);

  curr = 0;
  base = 0;
  nextbase = 0;
  for(j=0; j<n; j++)
  {
    for(i=0; i<lim-1; i++)
    {
      next = rnd(&r);
      for(k=0; k<step; k++)
      {
        tab[base+curr*step+revbits(k, step)] = base+next*step+revbits(k, step);
      }
      curr = next;
    }
    if(j<n-1)
      nextbase = base+lim*step;
    else
      nextbase = 0;
    next = rnd(&r);
    for(k=0; k<step; k++)
    {
      tab[base+curr*step+revbits(k, step)] = nextbase+next*step+revbits(k, step);
    }
    curr = next;
    base = nextbase;
  }

  return tab;
}


#define MAX_REPEAT	99
#define MIN_REPEAT	10

/*
 *  Insert timing result into table, keeping
 *  table sorted
 */
void inserttime(mytime_t *ttable, mytime_t val)
{
  size_t i;

  for(i=MAX_REPEAT; i>0; i--)
  {
    if(val<ttable[i-1] || ttable[i-1]==0)
    {
      ttable[i] = ttable[i-1];
    }
    else
    {
      break;
    }
  }
  ttable[i] = val;
}


/*
 *  Measure total idle memory latency as visible to applications
 */
#define CALIBRATOR_CODE ;
mytime_t walktab_fullacc(u32 steps, u32 tabsize, u32 blocksize, u32 stride, u32 prec)
#include "later.inc"
#undef CALIBRATOR_CODE

/*
 *  Measure idle memory latency, excluding L1 access (but this might be more precise)
 */
#define CALIBRATOR_CODE curr = minitab[curr]
mytime_t walktab_nol1(u32 steps, u32 tabsize, u32 blocksize, u32 stride, u32 prec)
#include "later.inc"
#undef CALIBRATOR_CODE


/*
 * Here we start
 */
int main(int argc, char *argv[])
{
  u32 blocksize, stride, prec, steps;
  i64 t;

  if(sizeof(u32) < 4)
  {
    fprintf(stderr, "Sorry, your machine/compiler combo is not suitable for this benchmark\n");
    exit(20);
  }

  fprintf(stderr, "Later -- memory latency measuring tool, ver. 0.6\nPD 2003 by Aces Hardware Community\n\n");

  if(argc < 2)
  {
    printf("usage: later <blocksize in KB> [<speedup factor> [<minimal stride>]]\n");
    printf(
    "blocksize must be a positive integer power of 2;\n"
    "speedup factor must be a positive integer and it shortens benchmark run time\n"
    "but at the same time negatively affects accuracy, default value is 5;\n"
    "minimal stride must be a positive integer power of 2 and it defines minimal\n"
    "difference between consequtively vistited mem addresses -- it must be greater\n"
    "than longest cacheline of the CPU, otherwise measurement will be wrong, best\n"
    "idea is to leve it at it's default 256.\n\n"
    "ranges for parameters are...\n"
    "blockisize -- 4 to 32768 (only powers of 2)\n"
    "speedup factor -- 1 to 50 (just integer numbers)\n"
    "minimal stride -- 8 to 1024 (only powers of two)\n\n"
    );
    printf(
    "Contributors (in alphabetical order):\n"
    "Bamsen, Eric Bron, Just Brew It, Sebastian Kaliszewski, NoSpammer\n\n"
    );
    exit(0);
  }

  blocksize = atol(argv[1])*1024;

  if(argc > 2)
    prec = atol(argv[2]);
  else
    prec = 20;

  if(argc > 3)
    stride = atol(argv[3]);
  else
    stride = 256;

  if((argc > 4) ||
     (stride<8 || stride>1024 || (stride&(stride-1))!=0) ||
     (blocksize<4*1024 || blocksize>32768*1024 || (blocksize&(blocksize-1))!=0) ||
     (prec<1 || prec>50)
    )
  {
    fprintf(stderr, "wrong arguments\n");
    exit(10);
  }

  fprintf(stderr, "parameters: blocksize=%luB, speedup=%ux, stride=%uB\n",
                  (unsigned long)blocksize, (unsigned)prec, (unsigned)stride);

  printf("Heuristically (very roughly) estimated expected result accuracy is: %lu.%02lu%%\n",
         100ul/(2000ul/prec), (10000ul/(2000ul/prec))%100ul);
  steps = 100000000 / prec;

  /*
   * Try to take over the system (i.e. increase priority)
   */
  nice(-19);

  /*
   * Total idle memory latency -- full result, but slightly less reliable
   */
  t = walktab_fullacc(steps, (u32)(1ul<<25ul), blocksize, stride, prec);
  fprintf(stderr, "\nMeasured total idle memory latency for %luKB blocking is: %lu.%02luns\n",
         (unsigned long)blocksize/1024, (unsigned long)(t*((i64)1000000)/steps), (unsigned long)((t*((i64)100000000)/steps)%100));
  printf("Measured total idle memory latency for %luKB blocking is: %lu.%02luns\n",
         (unsigned long)blocksize/1024, (unsigned long)(t*((i64)1000000)/steps), (unsigned long)((t*((i64)100000000)/steps)%100));

  /*
   * Idle memory latency, with L1 timings not counted -- but should be a bit more reliable
   */
  t = walktab_nol1(steps, (u32)(1ul<<25ul), blocksize, stride, prec);
  fprintf(stderr, "\nMeasured idle memory latency w/o L1 for %luKB blocking is: %lu.%02luns\n",
         (unsigned long)blocksize/1024, (unsigned long)(t*((i64)1000000)/steps), (unsigned long)((t*((i64)100000000)/steps)%100));
  printf("Measured idle memory latency w/o L1 for %luKB blocking is: %lu.%02luns\n",
         (unsigned long)blocksize/1024, (unsigned long)(t*((i64)1000000)/steps), (unsigned long)((t*((i64)100000000)/steps)%100));

  fprintf(stderr, "\n");

  return 0;
}
