 #include <iostream>
 #include <assert.h>
 #include <limits.h>
#include <alloca.h>
#include <stdlib.h>



#pragma comment(lib, "opencl")
//#include <windows.h>
#include <vector>
#include <string>
#include <fstream>
#include <iostream>
#include <cmath>
#include <cassert>
#include <sys/types.h>
#include <alloca.h>


#define __CL_ENABLE_EXCEPTIONS
//#include <cl/cl.hpp>
#include <CL/cl.h>

typedef unsigned int uint;

const size_t NumLoops = 5000;
const size_t NumElements = 1<< 20;

template<typename T>
bool DumpBuffer(cl::CommandQueue& queue, cl::Buffer& buffer, std::vector<T>& data) {
   size_t size;
   buffer.getInfo(CL_MEM_SIZE, &size);

   if(size % sizeof(T)) return false;

   data.resize(size / sizeof(T));
   queue.enqueueReadBuffer(buffer, true, 0, size, &data[0]);
   return true;
}

class PrefixSumCache {
public:
   size_t Capacity() const { return _capacity; }
   cl::Buffer& BaseLevel() { return _levels[0]; }

private:
   friend class CLPrefixSum;

   std::vector<cl::Buffer> _levels;
   size_t _capacity, _valuesPerGroup;
};

class CLPrefixSum {
public:
   void CreatePrefixSumCache(size_t capacity, std::auto_ptr<PrefixSumCache>* ppCache);
   void Execute(cl::CommandQueue& queue, size_t numElements, PrefixSumCache* cache);

private:
   friend void CreateCLPrefixSum(cl::Context& context, size_t valuesPerGroup, size_t threadsPerGroup,
      cl::Kernel& pass1, cl::Kernel& finalize, cl::Kernel& pass2,
      std::auto_ptr<CLPrefixSum>* ppPrefixSum);

   static void LevelCounts(size_t numElements, size_t valuesPerGroup, 
      std::vector<size_t>& levelCounts);

   cl::Context _context;
   cl::Kernel _pass1, _finalize, _pass2;
   size_t _valuesPerGroup, _threadsPerGroup;
};

void CreateCLPrefixSum(cl::Context& context, size_t valuesPerGroup, size_t threadsPerGroup,
   cl::Kernel& pass1, cl::Kernel& finalize, cl::Kernel& pass2,
   std::auto_ptr<CLPrefixSum>* ppPrefixSum) {

   std::auto_ptr<CLPrefixSum> prefixSum(new CLPrefixSum);
   prefixSum->_context = context;
   prefixSum->_pass1 = pass1;
   prefixSum->_finalize = finalize;
   prefixSum->_pass2 = pass2;
   prefixSum->_valuesPerGroup = valuesPerGroup;
   prefixSum->_threadsPerGroup = threadsPerGroup;
   *ppPrefixSum = prefixSum;
}

void CLPrefixSum::LevelCounts(size_t numElements, size_t valuesPerGroup,
   std::vector<size_t>& levelCounts) {

   // write the dispatch count for each pass1/pass2 level.
   while(numElements > valuesPerGroup) {
      numElements = (numElements + valuesPerGroup - 1) / valuesPerGroup;
      levelCounts.push_back(numElements);
   }
}

void CLPrefixSum::CreatePrefixSumCache(size_t capacity, std::auto_ptr<PrefixSumCache>* ppCache) {
   std::auto_ptr<PrefixSumCache> cache(new PrefixSumCache);
   cache->_capacity = capacity;
   cache->_valuesPerGroup = _valuesPerGroup;

   std::vector<size_t> levelCounts;
   LevelCounts(capacity, _valuesPerGroup, levelCounts);

   capacity = ~(_valuesPerGroup - 1) & (capacity + _valuesPerGroup - 1);
   cache->_levels.push_back(cl::Buffer(_context, CL_MEM_READ_WRITE, 4 * capacity));

   for(size_t i(0); i < levelCounts.size(); ++i) {
      size_t count = levelCounts[i];
      capacity = ~(_valuesPerGroup - 1) & (count + _valuesPerGroup - 1);
      cache->_levels.push_back(cl::Buffer(_context, CL_MEM_READ_WRITE, 4 * capacity));
   }
   
   *ppCache = cache;
}

void CLPrefixSum::Execute(cl::CommandQueue& queue, size_t numElements,
   PrefixSumCache* cache) {

   assert(cache->_valuesPerGroup == _valuesPerGroup);

   std::vector<size_t> levelCounts;
   LevelCounts(numElements, _valuesPerGroup, levelCounts);
   
   for(size_t level(0); level < levelCounts.size(); ++level) {
      cl::KernelFunctor f(_pass1, queue, cl::NDRange(), levelCounts[level] * _threadsPerGroup,
         _threadsPerGroup);
      f(cache->_levels[level], cache->_levels[level + 1]);
   }

   cl::KernelFunctor f(_finalize, queue, cl::NDRange(), _threadsPerGroup, _threadsPerGroup);
   f(cache->_levels.back());

   for(int level(levelCounts.size() - 1); level >= 0; --level) {
   cl::KernelFunctor f(_pass2, queue, cl::NDRange(), levelCounts[level] * _threadsPerGroup,
         _threadsPerGroup);
      f(cache->_levels[level], cache->_levels[level + 1]);
   }
}

int main(int argc, char** argv) {
   // read in the file
   std::ifstream f("scan.cl");
   std::string s(std::istreambuf_iterator<char>(f), std::istreambuf_iterator<char>(0));

   std::vector<cl::Platform> platforms;
   cl::Platform::get(&platforms);

   std::vector<cl::Device> devices;
   platforms.front().getDevices(CL_DEVICE_TYPE_GPU, &devices);

   cl_context_properties props[3] = {
      CL_CONTEXT_PLATFORM,
      (cl_context_properties)platforms[0](),
      0
   };
   cl::Context context(CL_DEVICE_TYPE_GPU, props);

   cl::CommandQueue commandQueue(context, devices[0]);

   cl::Program program(context, cl::Program::Sources(1, std::make_pair(s.c_str(), s.size())));

   int err = program.build(devices, "-D NUM_LEVELS=7");

   if(err) {
      std::string buildError;
      program.getBuildInfo(devices[0], CL_PROGRAM_BUILD_LOG, &buildError);
      std::cout<< buildError;
      return 0;
   }

   cl::Kernel pass1(program, "PrefixSumBlock_Pass1", &err);
   cl::Kernel finalize(program, "PrefixSumBlock_Finalize", &err);
   cl::Kernel pass2(program, "PrefixSumBlock_Pass2", &err);

   const int NumLevels = 7;
   const int NumThreads = 1<< NumLevels;
   const int ValuesPerThread = 8;
   const int ValuesPerGroup = 8 * NumThreads;

   std::auto_ptr<CLPrefixSum> prefixSum;
   CreateCLPrefixSum(context, ValuesPerGroup, NumThreads, pass1, finalize,
      pass2, &prefixSum);

   std::auto_ptr<PrefixSumCache> prefixSumCache;
   prefixSum->CreatePrefixSumCache(NumElements, &prefixSumCache);

   std::vector<uint> values(NumElements, 1);

   commandQueue.enqueueWriteBuffer(prefixSumCache->BaseLevel(), true, 0, 
      NumElements * 4, &values[0]);
   commandQueue.finish();

   LARGE_INTEGER freq;
      LARGE_INTEGER begin, end;
   QueryPerformanceFrequency(&freq);
   double period = 1.0 / freq.QuadPart;

   for(int j(0); j < 10; ++j) {
      QueryPerformanceCounter(&begin);

      for(int i(0); i < NumLoops; ++i)
         prefixSum->Execute(commandQueue, NumElements, prefixSumCache.get());
         
      std::vector<uint> values2;
      DumpBuffer(commandQueue, prefixSumCache->BaseLevel(), values2);
      QueryPerformanceCounter(&end);

      double elapsed = period * (end.QuadPart - begin.QuadPart);

      double velocity = NumElements * (NumLoops / elapsed);
      printf("GPU velocity: %1.3fM\n", velocity / 1.0e6);
   }
}
