#include <stdbool.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <stdio.h>
#include <stdlib.h>
#include <CL/cl.h>
#include <math.h>

//////////////////////////////////////////////////////////////////////
#define MAX_GROUPS         (64)
#define MAX_WORK_ITEMS     (64)
static int count        = 1024 * 1024;
//////////////////////////////////////////////////////////////////////
static char *
load_program_source(const char *filename)
{
    struct stat statbuf;
    FILE          *fh;
    char          *source;
    fh = fopen(filename, "r");
    if (fh == 0)
         return 0;
    stat(filename, &statbuf);
    source = (char *) malloc(statbuf.st_size + 1);
    fread(source, statbuf.st_size, 1, fh);
    source[statbuf.st_size] = '\0';
    return source;
}
//////////////////////////////////////////////////////////////////////
float reduce_float(float *data, int size)
{
    int i;
    float sum = data[0];
    float c = (float)0.0f;
    for (i = 1; i < size; i++)
    {
         float y = data[i] - c;
         float t = sum + y;
         c = (t - sum) - y;
         sum = t;
    }
    return sum;
}
//////////////////////////////////////////////////////////////////////
void create_reduction_pass_counts(
    int count, int max_groups, int max_work_items,
    int *level_count, size_t **group_counts,
    size_t **work_item_counts, int **entry_counts)
{
    int work_items = (count < max_work_items * 2) ? count / 2
                                                   : max_work_items;
    if (count < 1)
        work_items = 1;
    int groups = count / (work_items * 2);
    groups = max_groups < groups ? max_groups : groups;
    int max_levels = 1;
    int s = groups;
    while(s > 1)
    {
        int work_items = (s < max_work_items * 2) ? s / 2
                                                  : max_work_items;
        s = s / (work_items*2);
        max_levels++;
    }
    *group_counts = (size_t*)malloc(max_levels * sizeof(size_t));
    *work_item_counts = (size_t*)malloc(max_levels * sizeof(size_t));
    *entry_counts = (int*)malloc(max_levels * sizeof(int));
    (*level_count) = max_levels;
    (*group_counts)[0] = groups;
    (*work_item_counts)[0] = work_items;
    (*entry_counts)[0] = count;
    s = groups;
    int level = 1;
    while(s > 1)
    {
        int work_items = (s < max_work_items * 2) ? s / 2
                                                  : max_work_items;
        int groups = s / (work_items * 2);
        groups = (max_groups < groups) ? max_groups : groups;
        (*group_counts)[level] = groups;
        (*work_item_counts)[level] = work_items;
        (*entry_counts)[level] = s;
        s = s / (work_items*2);
        level++;
    }
}
//////////////////////////////////////////////////////////////////////
int main(int argc, char **argv)
{
    cl_int            err;
    cl_device_id      device_id;
    cl_command_queue commands;
    cl_context        context;
    cl_mem            output;
    cl_mem            input;
    cl_mem            partials;
    int               level_count = 0;
    size_t*           group_counts = 0;
    size_t*           work_item_counts = 0;
    int*              entry_counts = 0;
    int i;
    // Create some random input data on the host
    //
    float *float_data = (float*)malloc(count * sizeof(float));
    for (i = 0; i < count; i++)
    {
         float_data[i] = ((float) rand() / (float) RAND_MAX);
    }
    // Connect to a GPU compute device
    //
    err = clGetDeviceIDs(NULL, CL_DEVICE_TYPE_GPU,
                                  1, &device_id, NULL);
    if (err != CL_SUCCESS)
    {
         printf("Error: Failed to create a device group!\n");
         return EXIT_FAILURE;
    }
    // Create a compute context
    //
    context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &err);
    if (!context)
    {
         printf("Error: Failed to create a compute context!\n");
         return EXIT_FAILURE;
    }
    // Create a command commands
    //
    commands = clCreateCommandQueue(context, device_id, 0, &err);
    if (!commands)
    {
         printf("Error: Failed to create a command commands!\n");
         return EXIT_FAILURE;
    }
    // Load the compute program from disk into a cstring buffer
//
const char* filename = "reduce_kernel.cl";
printf("Loading program '%s'...\n", filename);
char *source = load_program_source(filename);
if(!source)
{
    printf("Error: Failed to load compute program from file!\n");
    return EXIT_FAILURE;
}
// Create the input buffer on the device
//
input = clCreateBuffer(context, CL_MEM_READ_WRITE,
                        sizeof(float) * count, NULL, NULL);
if (!input)
{
    printf("Error: Failed to allocate input data buffer on device!\n");
    return EXIT_FAILURE;
}
// Fill the input buffer with the host allocated random data
//
err = clEnqueueWriteBuffer(commands, input, CL_TRUE, 0,
                  sizeof(float) * count, (void *)float_data,
                  0, NULL, NULL);
if (err != CL_SUCCESS)
{
    printf("Error: Failed to write to input data buffer on device!\n");
    return EXIT_FAILURE;
}
// Create an intermediate data buffer for intra-level results
//
partials = clCreateBuffer(context, CL_MEM_READ_WRITE,
                        sizeof(float) * count, NULL, NULL);
if (!partials)
{
    printf("Error: Failed to allocate partial sum buffer on device!\n");
    return EXIT_FAILURE;
}
// Create the output buffer on the device
//
output = clCreateBuffer(context, CL_MEM_READ_WRITE,
                        sizeof(float) * count, NULL, NULL);
if (!output)
{
    printf("Error: Failed to allocate result buffer on device!\n");
     return EXIT_FAILURE;
}
// Determine the global and local dimensions for the execution
//
create_reduction_pass_counts(count, MAX_GROUPS, MAX_WORK_ITEMS,
                              &level_count, &group_counts,
                              &work_item_counts, &entry_counts);
// Create programs and kernels for each level of the reduction
//
cl_program *programs =
  (cl_program*)malloc(level_count * sizeof(cl_program));
memset(programs, 0, level_count * sizeof(cl_program));
cl_kernel *kernels = (cl_kernel*)malloc(level_count *
                                          sizeof(cl_kernel));
memset(kernels, 0, level_count * sizeof(cl_kernel));
for(i = 0; i < level_count; i++)
{
     char *block_source = malloc(strlen(source) + 1024);
     size_t length = strlen(source) + 1024;
     memset(block_source, 0, length);
     // Insert macro definitions to specialize the kernel to a
    // particular group size
     //
     const char define[] = "#define GROUP_SIZE";
     sprintf(block_source, "%s (%d) \n%s\n", define,
                   (int)group_counts[i], source);
     // Create the compute program from the source buffer
     //
     programs[i] = clCreateProgramWithSource(context, 1,
                        (const char **) & block_source,
                        NULL, &err);
     if (!programs[i] || err != CL_SUCCESS)
     {
         printf("Error: Failed to create compute program!\n");
         return EXIT_FAILURE;
     }
     // Build the program executable
     //
     err = clBuildProgram(programs[i], 0, NULL, NULL, NULL, NULL);
     if (err != CL_SUCCESS)
     {
         size_t len;
         char buffer[2048];
        printf("Error: Failed to build program executable!\n");
        clGetProgramBuildInfo(programs[i], device_id,
            CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len);
        printf("%s\n", buffer);
        return EXIT_FAILURE;
    }
    // Create the compute kernel from within the program
    //
    kernels[i] = clCreateKernel(programs[i], "reduce", &err);
    if (!kernels[i] || err != CL_SUCCESS)
    {
        printf("Error: Failed to create compute kernel!\n");
        return EXIT_FAILURE;
    }
    free(block_source);
}
// Do the reduction for each level
//
printf("Performing Reduction [%d]...\n", count);
cl_mem pass_swap = output;
cl_mem pass_input = output;
cl_mem pass_output = input;
for(i = 0; i < level_count; i++)
{
    size_t global = group_counts[i] * work_item_counts[i];
    size_t local = work_item_counts[i];
    unsigned int entries = entry_counts[i];
    printf("Pass[%4d] Global[%4d] Local[%4d] Groups[%4d] WorkItems[%4d] Entries[%d]\n", i,
            (int)global, (int)local, (int)group_counts[i],
            (int)work_item_counts[i], entries);
    // Swap the inputs and outputs for each pass
    //
    pass_swap = pass_input;
    pass_input = pass_output;
    pass_output = pass_swap;
    err = CL_SUCCESS;
    err |= clSetKernelArg(kernels[i], 0,
                             sizeof(cl_mem), &pass_output);
    err |= clSetKernelArg(kernels[i], 1,
                             sizeof(cl_mem), &pass_input);
    err |= clSetKernelArg(kernels[i], 2,
                        sizeof(float) * work_item_counts[i], NULL);
    err |= clSetKernelArg(kernels[i], 3,
                              sizeof(int),    &entries);
    if (err != CL_SUCCESS)
    {
         printf("Error: Failed to set kernel arguments!\n");
         return EXIT_FAILURE;
    }
    // After the first pass, use the partial sums for the
   // next input values
    //
    if(pass_input == input)
         pass_input = partials;
    err = CL_SUCCESS;
    err |= clEnqueueNDRangeKernel(commands, kernels[i], 1, NULL,
                              &global, &local, 0, NULL, NULL);
    if (err != CL_SUCCESS)
    {
         printf("Error: Failed to execute kernel!\n");
         return EXIT_FAILURE;
    }
}
// Read back the final sum that was computed on the device
//
float computed_result = 0.0f;
err = clEnqueueReadBuffer(commands, pass_output, CL_TRUE, 0,
                    sizeof(float), &computed_result, 0, NULL, NULL);
if (err)
{
    printf("Error: Failed to read back results from the device!\n");
    return EXIT_FAILURE;
}
// Do our own reduction to compare the results
//
float reference = reduce_float(float_data, count);
// Verify the results are correct
//
float error = fabs(reference - computed_result);
// Report any incorrect results
//
if (error > 1)
{
	//1e-5
    printf("Reference %f != Device Result %f\n",
                        reference, computed_result);
    printf("Error: Incorrect results obtained! Max error = %f\n", error);
       return EXIT_FAILURE;
  }
  else
  {
       printf("Results Validated!\n");
  }
  // Shutdown and cleanup
  //
  for(i = 0; i < level_count; i++)
  {
       clReleaseKernel(kernels[i]);
       clReleaseProgram(programs[i]);
  }
  clReleaseMemObject(input);
  clReleaseMemObject(output);
  clReleaseMemObject(partials);
  clReleaseCommandQueue(commands);
  clReleaseContext(context);
  free(group_counts);
  free(work_item_counts);
  free(entry_counts);
  free(kernels);
  return 0;
}

