#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <cstring>
#include <sys/stat.h>
#include <sys/types.h>
#include <CL/cl.h>
#include <unistd.h>

/* Number of float elements in each host array and each device buffer. */
#define ARRAY_EL 512

/*
 * Host-side arrays.  They are allocated by their initializers, i.e. before
 * main() starts (a non-constant file-scope initializer needs a C++ compiler).
 * main() fills them and free_memory() releases them.
 */
float *input1 = (float *) malloc(ARRAY_EL * sizeof *input1);
float *input2 = (float *) malloc(ARRAY_EL * sizeof *input2);
float *output = (float *) malloc(ARRAY_EL * sizeof *output);

/*
 * Device buffer handles.  They start out zero-initialized, are assigned in
 * main() via clCreateBuffer, and are released by free_memory().
 */
cl_mem cl_input1;
cl_mem cl_input2;
cl_mem cl_output;

/* Currently unused by any active code in this file. */
int testing;

/*
 * Read the contents of `filename` into a freshly allocated, NUL-terminated
 * buffer.
 *
 * filename: path of the file to read.
 *
 * Returns a malloc'd string that the caller must free(), or NULL when the
 * file cannot be opened, its size cannot be determined, memory cannot be
 * allocated, or a read error occurs.  The stream is closed on every path.
 */
static char *
load_program_source(const char *filename)
{
    struct stat statbuf;
    FILE       *fh;
    char       *source;
    size_t      size;
    size_t      nread;

    fh = fopen(filename, "r");
    if (fh == NULL)
        return NULL;

    /* The file size is used as an upper bound for the buffer. */
    if (stat(filename, &statbuf) != 0 || statbuf.st_size < 0) {
        fclose(fh);
        return NULL;
    }
    size = (size_t) statbuf.st_size;

    source = (char *) malloc(size + 1);
    if (source == NULL) {
        fclose(fh);
        return NULL;
    }

    /*
     * Read byte-wise so the return value is the number of bytes actually
     * stored; the terminator goes right after them even if fewer than
     * `size` bytes were delivered.
     */
    nread = fread(source, 1, size, fh);
    if (ferror(fh)) {
        free(source);
        fclose(fh);
        return NULL;
    }
    source[nread] = '\0';

    fclose(fh);
    return source;
}

static void
free_memory(void)
{
        printf("freeing memory objects!\n");
         
        clReleaseMemObject(cl_input1);
        clReleaseMemObject(cl_input2);
        clReleaseMemObject(cl_output);

        //clReleaseKernel (kernels);
        //clReleaseProgram (programs);
        
        free(input1);
        free(input2);
        free(output);
                
        printf("all memory freed!\n");
}



int main(int argc, const char** argv)
{

	char                  Buffer[1024];
	char                  Temp[2];
	cl_uint               ciDeviceCount = 1;
	cl_device_id          *devices;
	unsigned int          i;
	cl_device_id          *device_id;	
	cl_command_queue      commands;	
	cl_context            GPUcontext;
	const char*           filename = "alpha.cl";
	//char                  *source;
	//cl_mem                input;
	//cl_mem                output;
	//float                 *input1 = (float*)malloc(1000000 * sizeof(float));
        //float                 *input2 = (float*)malloc(1000000 * sizeof(float));
        //float                 *output = (float*)malloc(1000000 * sizeof(float));
	cl_platform_id        *context_platform;

	for (i = 0; i < ARRAY_EL; i++)
	{
		input1[i] = ((float) rand() / (float) RAND_MAX/2);
                input2[i] = ((float) rand() / (float) RAND_MAX/2);
                output[i] = ((float) rand() / (float) RAND_MAX/2);
	}


output[0] = 150;
//        for(i=0; i < 10; i++) {
//                printf("element %d: %g\n", i, input1[i]);
//        }



	cl_platform_id clSelectedPlatformID = NULL;

	cl_int ciErrNum = clGetPlatformIDs (ciDeviceCount, &clSelectedPlatformID, &ciDeviceCount);
	if (ciErrNum != CL_SUCCESS)
	{
		printf("clGetPlatformIDs Failed! (return code %i)\n", ciErrNum);
		free_memory();
		exit(1);
	}

	
	ciErrNum = clGetDeviceIDs (clSelectedPlatformID, CL_DEVICE_TYPE_ALL, 0, NULL, &ciDeviceCount);
	if (ciDeviceCount == 0)
	{
		printf("No devices found supporting OpenCL (return code %i)\n\n", ciErrNum);
		free_memory();
		exit(1);
	}else if (ciErrNum != CL_SUCCESS){
		 printf("Error %i in clGetDeviceIDs call !!!\n\n", ciErrNum);
		free_memory();
		exit(1);
	}else{

	printf("ciDeviceCount: %x\n", ciDeviceCount);

		if ((devices = (cl_device_id*)malloc(sizeof(cl_device_id) * ciDeviceCount)) != NULL)
		{
			ciErrNum = clGetDeviceIDs (NULL, CL_DEVICE_TYPE_ALL, ciDeviceCount, devices, &ciDeviceCount);
			if (ciErrNum == CL_SUCCESS)
			{
				for(i = 0; i < ciDeviceCount; ++i )
				{
					clGetDeviceInfo(devices[i], CL_DEVICE_NAME, sizeof(Buffer), &Buffer, NULL);
					printf("Device %d: %s\n", i, Buffer);

				}
			}else{
				printf("clGetDeviceInfo Failed! (return code %i)\n", ciErrNum);
				free_memory();
				exit(1);
			}
		}else{
			printf("Failed to allocate memory!\n");
			free_memory();
			exit(1);
		}
	}





	// Connect to a GPU compute device
	ciErrNum = clGetDeviceIDs(clSelectedPlatformID, CL_DEVICE_TYPE_ALL, ciDeviceCount, devices, NULL);
	if (ciErrNum != CL_SUCCESS){
		printf("Failed to connect to gpu compute device!\n");
		free_memory();
		exit(1);
	}
	printf("connected to gpu compute device\n");


	// Create a compute context
	GPUcontext = clCreateContext(NULL, ciDeviceCount, &devices[0], NULL, NULL, &ciErrNum);
	if(!GPUcontext)
	{
         	printf("Error: Failed to create a compute context! %d\n", ciErrNum);
		free_memory();
         	exit(1);
	}
	printf("created compute context\n" );

	// Create a command queue
	//CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE
	commands = clCreateCommandQueue(GPUcontext, devices[0], NULL, &ciErrNum);
	if(!commands)
	{
		
        	printf("Error: Failed to create a command queue! error: %i\n", ciErrNum);
		free_memory();
        	exit(1);
	}
	printf("created command queue done!\n");


/*
	printf("Loading program '%s'...\n", filename);

	source = load_program_source(filename);
	if(!source)
	{
    		printf("Error: Failed to load compute program from file!\n");

		exit(1);
	}
*/
	//CL_MEM_USE_HOST_PTR
	cl_input1 = clCreateBuffer(GPUcontext, CL_MEM_READ_WRITE , sizeof(float)*ARRAY_EL, NULL, NULL);
	//cl_input1 = clCreateBuffer(GPUcontext, CL_MEM_READ_ONLY, sizeof(float)*ARRAY_EL, (void *)input1, NULL);
	if(!cl_input1)
	{
		printf("Error: Failed to allocate input data buffer #1 on device!\n");
		free_memory();
		exit(1);
	}

        cl_input2 = clCreateBuffer(GPUcontext, CL_MEM_READ_WRITE , sizeof(float)*ARRAY_EL, NULL, NULL);
        if(!cl_input2)
        {
                printf("Error: Failed to allocate input data buffer #2 on device!\n");
		free_memory();
                exit(1);
        }



	
	//ciErrNum = clEnqueueReadBuffer(commands, cl_input1, CL_TRUE, 0, sizeof(float) * ARRAY_EL, (void *)input1, 0, NULL, NULL);
	ciErrNum = clEnqueueWriteBuffer(commands, cl_input1, CL_TRUE, 0, sizeof(float)*ARRAY_EL, (void *)input1, 0, NULL, NULL);
	if (ciErrNum != CL_SUCCESS)
	{
		printf("Error: Failed to write to input data buffer #1 on device!\n");
		free_memory();
		exit(1);
	}

        //ciErrNum = clEnqueueReadBuffer(commands, cl_input2, CL_TRUE, 0, sizeof(float) * ARRAY_EL, (void *)input2, 0, NULL, NULL);
	ciErrNum = clEnqueueWriteBuffer(commands, cl_input2, CL_TRUE, 0, sizeof(float)*ARRAY_EL, (void *)input2, 0, NULL, NULL);
        if (ciErrNum != CL_SUCCESS)
        {
                printf("Error: Failed to write to input data buffer #2 on device!\n");
		free_memory();
                exit(1);
        }
	

	cl_output = clCreateBuffer(GPUcontext, CL_MEM_READ_WRITE , sizeof(float)*ARRAY_EL, NULL, NULL);
	if (!cl_output)
	{
		printf("Error: Failed to allocate output buffer on device!\n");
		free_memory();
		exit(1);
	}

        //ciErrNum = clEnqueueWriteBuffer(commands, cl_output, CL_TRUE, 0, sizeof(float)*ARRAY_EL, (void *)output, 0, NULL, NULL);


	ciErrNum = clEnqueueWriteBuffer(commands, cl_output, CL_TRUE, 0, sizeof(float) * ARRAY_EL, (void *)output, 0, NULL, NULL);
        if (ciErrNum != CL_SUCCESS)
        {
                printf("Error: Failed to write to output buffer on device!\n");
		free_memory();
                exit(1);
        }


	printf("all memories allocated and enqueued!\n");
	//printf("...sleeping 3 seconds!\n");
	//sleep(3);

	//free_memory();
/*
	printf("freeing memory objects!\n");

	clReleaseMemObject(cl_input1);
	clReleaseMemObject(cl_input2);
	clReleaseMemObject(cl_output);

	free(input1);
	free(input2);
	free(output);

	printf("all memory freed!\n");
*/



	//cl_program *programs =  (cl_program*)malloc(sizeof(cl_program));
	//cl_program programs;
	//programs =  (cl_program*)malloc(sizeof(cl_program));
	//memset(programs, 0, sizeof(cl_program));
	//cl_kernel *kernels = (cl_kernel*)malloc(sizeof(cl_kernel));
	//cl_kernel kernels;
	//kernels = (cl_kernel*)malloc(sizeof(cl_kernel));
	//memset(kernels, 0, sizeof(cl_kernel));

	const char *source = load_program_source("alpha.cl");
const size_t *lengths;

	//(const char **)

	cl_program programs = clCreateProgramWithSource(GPUcontext, 1, &source, NULL, &ciErrNum);
	if (!programs || ciErrNum != CL_SUCCESS)
	{
		printf("Error: Failed to create compute program!\n");
		free_memory();
		exit(1);
	}

	ciErrNum = clGetProgramInfo (programs,
                  CL_PROGRAM_SOURCE,
                  sizeof(Buffer), &Buffer,
                  NULL);
        if (ciErrNum != CL_SUCCESS)
        {
                printf("Error: Failed to get program info!\n");
                free_memory();
		clReleaseProgram (programs);
                exit(1);
        }else{
		printf("program info: %s\n", Buffer);
	}

//CL_DEVICE_COMPILER_AVAILABLE
clGetDeviceInfo (devices[0], CL_DEVICE_COMPILER_AVAILABLE,sizeof(ciErrNum), &ciErrNum, NULL);

printf("Compiler available: %d\n", ciErrNum);

	ciErrNum = clBuildProgram(programs, 0, NULL, NULL, NULL, NULL);
        if (ciErrNum != CL_SUCCESS)
        {       
		printf("Error: Failed to build program executable! error code: %i\n", ciErrNum);

cl_build_status build_string;
clGetProgramBuildInfo (programs,
                       devices[0],
                       CL_PROGRAM_BUILD_STATUS,
                       sizeof(build_string),
                       &build_string,
                       NULL);
		printf("build status: %d\n", build_string);

clGetProgramBuildInfo (programs,
                       devices[0],
                       CL_PROGRAM_BUILD_LOG,
                       sizeof(Buffer),
                       &Buffer,
                       NULL);
                printf("build log: %s\n", Buffer);






		free_memory();
		clReleaseProgram (programs);
                exit(1);
        }
//CL_PROGRAM_BUILD_LOG       
//CL_PROGRAM_BUILD_STATUS
	clUnloadCompiler();

	cl_kernel kernels = clCreateKernel(programs, "AlphaKernel", &ciErrNum);
	if (!kernels || ciErrNum != CL_SUCCESS)
	{ 
		printf("Error: Failed to create compute kernel! %d\n", ciErrNum);
		free_memory();
		clReleaseProgram (programs);
		exit(1);
	}

	printf("program loaded, build, and kernel created\n");

	ciErrNum = clSetKernelArg (kernels, 0, sizeof(cl_mem)*ARRAY_EL, &cl_input1);
        if (ciErrNum != CL_SUCCESS)   
        {
                printf("Error: Failed to set kernel arg 0!\n");
                free_memory();
                clReleaseProgram (programs);
		clReleaseKernel (kernels);
                exit(1);
        }

	ciErrNum = clSetKernelArg (kernels, 1, sizeof(cl_mem)*ARRAY_EL, &cl_input2);
        if (ciErrNum != CL_SUCCESS)
        {
                printf("Error: Failed to set kernel arg 0!\n");
                free_memory();
                clReleaseProgram (programs);
                clReleaseKernel (kernels);  
                exit(1);
        }

	ciErrNum = clSetKernelArg (kernels, 2, sizeof(cl_mem)*ARRAY_EL, &cl_output);
        if (ciErrNum != CL_SUCCESS)
        {
                printf("Error: Failed to set kernel arg 0!\n");
                free_memory();
                clReleaseProgram (programs);
                clReleaseKernel (kernels);  
                exit(1);
        }

/*
        ciErrNum = clSetKernelArg (kernels, 3, sizeof(int), testing);
        if (ciErrNum != CL_SUCCESS)
        {      
                printf("Error: Failed to set kernel arg 3!\n");
                free_memory();   
                clReleaseProgram (programs);
                clReleaseKernel (kernels);
                exit(1);
        }
*/



	printf("kernel args set\n");


	//size_t work_dim =  (size_t)malloc(sizeof(size_t)*1);
	//size_t local =  (size_t)malloc(sizeof(size_t)*2);
	//size_t global =  (size_t)malloc(sizeof(size_t)*512);
	//size_t *work_item_size = 512;
	//size_t *work_group_size = 512;

size_t global[1];
global[0] = 512;
size_t local[1];
local[0] = 512;

//unsigned long int ele = 0;
//global = 512;

//size_t global = group_counts * work_item_counts;
//size_t local = work_item_counts;

//const size_t *global_work_size
//const size_t *local_work_size

/*
err = CL_SUCCESS;
err |= clEnqueueNDRangeKernel(commands, kernels[i], 1, NULL,
                         &global, &local, 0, NULL, NULL);
if (err != CL_SUCCESS)
{
    printf("Error: Failed to execute kernel!\n");
    return EXIT_FAILURE;
}
*/



ciErrNum = clEnqueueNDRangeKernel (commands, kernels, 1, NULL, global, local, 0, NULL, NULL);
        if (ciErrNum != CL_SUCCESS)
        {
                printf("Error: Failed to execute kernel error: %d\n", ciErrNum);
                free_memory();
                exit(1);
        }



printf("ran kernel\n");
/*
ciErrNum = clEnqueueTask (cl_command_queue command_queue,
               cl_kernel kernel,
               cl_uint num_events_in_wait_list,
               const cl_event *event_wait_list,
               cl_event *event);
*/
//float computed_result[ARRAY_EL];
//computed_result[0] = 222;

        ciErrNum = clEnqueueReadBuffer(commands, cl_output, CL_TRUE, 0, sizeof(float)*ARRAY_EL, (void *)output, 0, NULL, NULL);
        if (ciErrNum != CL_SUCCESS)
        {
                printf("Error: Failed to write to output buffer on device! error: %d\n", ciErrNum);
                free_memory();
                exit(1);
        }

/*
        ciErrNum = clEnqueueReadBuffer(commands, cl_input2, CL_TRUE, 0, sizeof(float)*ARRAY_EL, &computed_result, 0, NULL, NULL);
        if (ciErrNum != CL_SUCCESS)
        {
                printf("Error: Failed to write to output buffer on device! error: %d\n", ciErrNum);
                free_memory();
                exit(1);
        }
*/

//computed_result[0] = 222;
//printf("result: %g\n", computed_result[0]);

	for(i=0; i < 32; i++) {
		//printf("result: %g\n", computed_result[i]);
		printf("output %d: %g\n", i, output[i]);
	}
/*
	for(i=0; i < 5; i++) {
		printf("in1/in2 %d: %g/%g\n", i, input1[i], input2[i]);
	}
*/
        for(i=0; i < 32; i++) {
                printf("in1-in2 %d: %g\n", i, input1[i]-input2[i]);
		//printf("element %d: %g\n", i, input2[i]);
        }


printf("\n\noutput: %g\n", output[509]);
printf("in1-in2: %g\n\n", input1[509]-input2[509]);


printf("\n\noutput: %g\n", output[510]);
printf("in1-in2: %g\n\n", input1[510]-input2[510]);

printf("\n\noutput: %g\n", output[511]);
printf("in1-in2: %g\n\n", input1[511]-input2[511]);




//printf("testing: %d\n", testing);

	printf("freeing memory objects!\n");
	//clUnloadCompiler (void);
	clReleaseKernel (kernels);
	clReleaseProgram (programs);
                
        clReleaseMemObject(cl_input1);
        clReleaseMemObject(cl_input2);
        clReleaseMemObject(cl_output);

        free(input1);
        free(input2);
        free(output);
        
        printf("all memory freed!\n");


printf("done!\n");




	return(0);
}
