#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <cstring>
#include <sys/stat.h>
#include <sys/types.h>
#include <CL/cl.h>
#include <unistd.h>
#include <GL/glut.h>
#include <GL/gl.h>
#include <sys/time.h>
#include <math.h>
//#include <pthread.h>


        /*
         * File-wide OpenCL state.  Everything here has external linkage and is
         * shared by the functions below (and by any header included later in
         * this translation unit).
         */

        /* General-purpose text buffer for strings returned by OpenCL queries. */
        char                  Buffer[2048];
        /* Not referenced in the code visible here. */
        char                  Temp[2];

        /* Generic counters. */
        unsigned int          i;
        unsigned int          j;

        /* Number of pixels in one image (screen width * height); set in main(). */
        unsigned long int     ARRAY_EL;

        /* Path of the OpenCL kernel source file. */
        const char           *filename = "alpha.cl";

        /* Platform / device discovery. */
        cl_platform_id        clSelectedPlatformID = NULL;
        cl_platform_id       *context_platform;   /* not referenced in the code visible here */
        cl_uint               ciDeviceCount = 1;  /* device count; starts at 1 */
        cl_device_id         *devices;            /* device list allocated in main() */
        cl_device_id         *device_id;          /* not referenced in the code visible here */

        /* Execution objects created in main(). */
        cl_context            GPUcontext;
        cl_command_queue      commands;
        cl_program            programs;
        cl_kernel             kernels;

        /* One-dimensional work sizes passed to clEnqueueNDRangeKernel. */
        size_t                global[1];
        size_t                local[1];

        /* Result of the most recent OpenCL call, and the program build status. */
        cl_int                ciErrNum;
        cl_build_status       build_string;

//#include "gl_funcs.h"

/*
 * Forward declarations for the rendering / UI routines used by main().
 * Their definitions are not in this part of the file; presumably they live
 * in gl_funcs.h, which is included just before main() -- confirm there.
 */

/* Called once from main() right after the GLUT window is created. */
void OpenGLInit(void);
/* Registered in main() as the GLUT display callback. */
static void Animate(void );
/* Registered in main() as the GLUT reshape callback. */
static void ResizeWindow(int w, int h);
//void keyboard (unsigned char key, int x, int y);
//void special (int key, int x, int y);
/* Called from main() just before entering glutMainLoop(). */
void createGLUTMenus(void);
/* Not called in the visible code; presumably the menu callback -- confirm. */
void processMenuEvents(int option);


/*
 * Command identifiers.  Not used in the code visible here; presumably
 * consumed by the menu handling -- confirm where processMenuEvents() is
 * defined.  Kept as macros so they remain usable in #if and case labels.
 */
#define RESTART 1
#define PAUSED  2
#define EXIT    3

/* Timing state; none of it is touched by the code visible here. */
struct timespec ts;
struct timeval  starttime;
struct timeval  endtime;
struct timeval  stime1;
struct timeval  etime1;

double fps   = 0.0;
double time1 = 0.0;
double time2 = 0.0;

/* Screen geometry: main() fills X/Y from glutGet() and sets RES = X * Y. */
int X;
int Y;
int RES;

/* Not referenced in the code visible here. */
int K;
int J;
int DEPTH = 3;

/*
 * One pixel: four colour channels stored as one unsigned char each, in the
 * order red, green, blue, alpha.  Arrays of this struct are copied verbatim
 * to and from OpenCL device buffers, so the member order and types must not
 * change.
 */
struct screen {
        unsigned char red, green, blue, alpha;
};


	/*
	 * Host-side images, one struct screen per pixel; allocated in main().
	 * input1/input2 are uploaded to the device and output receives the
	 * kernel result in Compute_Blend().
	 */
	struct screen *input1, *input2, *output;

	/* Device-side buffers matching the three host images above. */
	cl_mem cl_input1, cl_input2, cl_output;

/*
 * Read an entire file into a freshly allocated, NUL-terminated string.
 *
 * filename: path of the file to read.
 *
 * Returns a malloc()ed buffer that the caller must free(), or NULL when the
 * file cannot be opened, its size cannot be determined, memory cannot be
 * allocated, or a read error occurs.  The stream is closed on every path.
 */
static char *
load_program_source(const char *filename)
{
    struct stat  statbuf;
    FILE        *fh;
    char        *source;
    size_t       size;
    size_t       got;

    fh = fopen(filename, "r");
    if (fh == NULL)
        return NULL;

    if (stat(filename, &statbuf) != 0 || statbuf.st_size < 0) {
        fclose(fh);
        return NULL;
    }
    size = (size_t) statbuf.st_size;

    source = (char *) malloc(size + 1);
    if (source == NULL) {
        fclose(fh);
        return NULL;
    }

    /*
     * Read byte-wise and terminate after the bytes actually read, so a
     * short read can never leave uninitialised data inside the string.
     */
    got = fread(source, 1, size, fh);
    if (got < size && ferror(fh)) {
        free(source);
        fclose(fh);
        return NULL;
    }
    source[got] = '\0';

    fclose(fh);
    return source;
}

static void
free_memory(void)
{
        printf("freeing memory objects!\n");
         
        clReleaseMemObject(cl_input1);
        clReleaseMemObject(cl_input2);
        clReleaseMemObject(cl_output);

        clReleaseKernel (kernels);
        clReleaseProgram (programs);

        for(int counter2=499; counter2<505; counter2++){
                for(int counter3=499; counter3<505; counter3++){
                        printf("output X: %d Y: %d red:%d\n", counter2, counter3, output[counter2*X+counter3].red);
                }
        }



        
        free(input1);
        free(input2);
        free(output);
                
        printf("all memory freed!\n");
}

//local[0] = 256;


static void Compute_Blend(void) {
        global[0] = X*Y;
        local[0] = 512;  


	//copy slices from public memory to gpu global mem
        ciErrNum = clEnqueueWriteBuffer(commands, cl_input1, CL_TRUE, 0, sizeof(struct screen)*ARRAY_EL, (void *)input1, 0, NULL, NULL);
        if (ciErrNum != CL_SUCCESS)
        {
                printf("Error: Failed to write to input data buffer #1 on device!\n");
                free_memory();
                exit(1);
        }
                
        ciErrNum = clEnqueueWriteBuffer(commands, cl_input2, CL_TRUE, 0, sizeof(struct screen)*ARRAY_EL, (void *)input2, 0, NULL, NULL);
        if (ciErrNum != CL_SUCCESS)
        {
                printf("Error: Failed to write to input data buffer #2 on device!\n");
                free_memory();
                exit(1);
        }


	//run alpha kernel
        ciErrNum = clEnqueueNDRangeKernel (commands, kernels, 1, NULL, global, local, 0, NULL, NULL);
        if (ciErrNum != CL_SUCCESS)
        {
                printf("Error: Failed to execute kernel error: %d\n", ciErrNum);
                free_memory();
                exit(1);
        }
 
        //printf("ran kernel\n");

        
	//copy result slice back to public mem
        ciErrNum = clEnqueueReadBuffer(commands, cl_output, CL_TRUE, 0, sizeof(struct screen)*ARRAY_EL, (void *)output, 0, NULL, NULL);
        if (ciErrNum != CL_SUCCESS)
        {
                printf("Error: Failed to write to output buffer on device! error: %d\n", ciErrNum);
                free_memory();
                exit(1);
        } 

}


#include "gl_funcs.h"

int main(int argc, char** argv) 
{
        glutInit(&argc,argv);
        glutInitDisplayMode(GLUT_DOUBLE | GLUT_RGB );
        glutInitWindowPosition( 0, 0 );
        
        Y = glutGet(GLUT_SCREEN_HEIGHT);
        X = glutGet(GLUT_SCREEN_WIDTH);
        RES = X*Y;
        long unsigned int j, counter1, counter2 ,counter3;

	ARRAY_EL = X*Y;

        printf("res: %d x: %d y: %d\n", RES, X, Y);

	input1 = (struct screen *)malloc(ARRAY_EL*sizeof(struct screen) );
        input2 = (struct screen *)malloc(ARRAY_EL*sizeof(struct screen) );
        output = (struct screen *)malloc(ARRAY_EL*sizeof(struct screen) );

	for (i = 0; i < ARRAY_EL; i++)
	{
		input1[i].red = 0;
                input1[i].green = 0;
                input1[i].blue = 0;
                input1[i].alpha = 0;

               	input2[i].red = 0;
                input2[i].green = 0;
                input2[i].blue = 0;
                input2[i].alpha = 0;

               	output[i].red = 0;
                output[i].green = 0;
                output[i].blue = 0;
                output[i].alpha = 0;
	}


	//apply squares
	//int counter1, counter2, counter3;

	for(counter2=0; counter2<Y; counter2++){
		for(counter3=0; counter3<X; counter3++){
			if(counter2>600 && counter2 < 800 && counter3>600 && counter3 < 800){
				input1[counter2*X+counter3].blue=255;
				input1[counter2*X+counter3].alpha=55;
			}
		}
	}

        for(counter2=0; counter2<Y; counter2++){
                for(counter3=0; counter3<X; counter3++){
			if(counter2>500 && counter2 < 700 && counter3>500 && counter3 < 700){
                        	input2[counter2*X+counter3].red=255;
                        	input2[counter2*X+counter3].alpha=85;
			}
                }
        }


        for(counter2=499; counter2<505; counter2++){
                for(counter3=499; counter3<505; counter3++){
			printf("input1 X: %ld Y: %ld red:%d\n", counter2, counter3, input2[counter2*X+counter3].red);
		}
	}
		
        //for(i=498; i < 505; i++) {
        //       	printf("input1 element: %d red:%d\n", i, input1[i].red);
        //}

	//cl_platform_id clSelectedPlatformID = NULL;

	ciErrNum = clGetPlatformIDs (ciDeviceCount, &clSelectedPlatformID, &ciDeviceCount);
	if (ciErrNum != CL_SUCCESS)
	{
		printf("clGetPlatformIDs Failed! (return code %i)\n", ciErrNum);
		free_memory();
		exit(1);
	}
	
	ciErrNum = clGetDeviceIDs (clSelectedPlatformID, CL_DEVICE_TYPE_ALL, 0, NULL, &ciDeviceCount);
	if (ciDeviceCount == 0)
	{
		printf("No devices found supporting OpenCL (return code %i)\n\n", ciErrNum);
		free_memory();
		exit(1);
	}else if (ciErrNum != CL_SUCCESS){
		 printf("Error %i in clGetDeviceIDs call !!!\n\n", ciErrNum);
		free_memory();
		exit(1);
	}else{

	printf("ciDeviceCount: %x\n", ciDeviceCount);

		if ((devices = (cl_device_id*)malloc(sizeof(cl_device_id) * ciDeviceCount)) != NULL)
		{
			ciErrNum = clGetDeviceIDs (NULL, CL_DEVICE_TYPE_ALL, ciDeviceCount, devices, &ciDeviceCount);
			if (ciErrNum == CL_SUCCESS)
			{
				for(i = 0; i < ciDeviceCount; ++i )
				{
					clGetDeviceInfo(devices[i], CL_DEVICE_NAME, sizeof(Buffer), &Buffer, NULL);
					printf("Device %d: %s\n", i, Buffer);

				}
			}else{
				printf("clGetDeviceInfo Failed! (return code %i)\n", ciErrNum);
				free_memory();
				exit(1);
			}
		}else{
			printf("Failed to allocate memory!\n");
			free_memory();
			exit(1);
		}
	}

	// Connect to a GPU compute device
	ciErrNum = clGetDeviceIDs(clSelectedPlatformID, CL_DEVICE_TYPE_ALL, ciDeviceCount, devices, NULL);
	if (ciErrNum != CL_SUCCESS){
		printf("Failed to connect to gpu compute device!\n");
		free_memory();
		exit(1);
	}
	printf("connected to gpu compute device\n");

	// Create a compute context
	GPUcontext = clCreateContext(NULL, ciDeviceCount, &devices[0], NULL, NULL, &ciErrNum);
	if(!GPUcontext)
	{
         	printf("Error: Failed to create a compute context! %d\n", ciErrNum);
		free_memory();
         	exit(1);
	}
	printf("created compute context\n" );

	// Create a command queue
	commands = clCreateCommandQueue(GPUcontext, devices[0], NULL, &ciErrNum);
	if(!commands)
	{
		
        	printf("Error: Failed to create a command queue! error: %i\n", ciErrNum);
		free_memory();
        	exit(1);
	}
	printf("created command queue done!\n");

	cl_input1 = clCreateBuffer(GPUcontext, CL_MEM_READ_WRITE , sizeof(struct screen)*ARRAY_EL, NULL, NULL);
	if(!cl_input1)
	{
		printf("Error: Failed to allocate input data buffer #1 on device!\n");
		free_memory();
		exit(1);
	}

        cl_input2 = clCreateBuffer(GPUcontext, CL_MEM_READ_WRITE , sizeof(struct screen)*ARRAY_EL, NULL, NULL);
        if(!cl_input2)
        {
                printf("Error: Failed to allocate input data buffer #2 on device!\n");
		free_memory();
                exit(1);
        }

	ciErrNum = clEnqueueWriteBuffer(commands, cl_input1, CL_TRUE, 0, sizeof(struct screen)*ARRAY_EL, (void *)input1, 0, NULL, NULL);
	if (ciErrNum != CL_SUCCESS)
	{
		printf("Error: Failed to write to input data buffer #1 on device!\n");
		free_memory();
		exit(1);
	}

	ciErrNum = clEnqueueWriteBuffer(commands, cl_input2, CL_TRUE, 0, sizeof(struct screen)*ARRAY_EL, (void *)input2, 0, NULL, NULL);
        if (ciErrNum != CL_SUCCESS)
        {
                printf("Error: Failed to write to input data buffer #2 on device!\n");
		free_memory();
                exit(1);
        }
	

	cl_output = clCreateBuffer(GPUcontext, CL_MEM_READ_WRITE , sizeof(struct screen)*ARRAY_EL, NULL, NULL);
	if (!cl_output)
	{
		printf("Error: Failed to allocate output buffer on device!\n");
		free_memory();
		exit(1);
	}

	ciErrNum = clEnqueueWriteBuffer(commands, cl_output, CL_TRUE, 0, sizeof(struct screen)*ARRAY_EL, (void *)output, 0, NULL, NULL);
        if (ciErrNum != CL_SUCCESS)
        {
                printf("Error: Failed to write to output buffer on device!\n");
		free_memory();
                exit(1);
        }


	printf("all memories allocated and enqueued!\n");

	const char *source = load_program_source("alpha.cl");

	programs = clCreateProgramWithSource(GPUcontext, 1, &source, NULL, &ciErrNum);
	if (!programs || ciErrNum != CL_SUCCESS)
	{
		printf("Error: Failed to create compute program!\n");
		free_memory();
		exit(1);
	}

	ciErrNum = clGetProgramInfo (programs,
                  CL_PROGRAM_SOURCE,
                  sizeof(Buffer), &Buffer,
                  NULL);
        if (ciErrNum != CL_SUCCESS)
        {
                printf("Error: Failed to get program info! error: %d\n", ciErrNum);
                free_memory();
		clReleaseProgram (programs);
                exit(1);
        }else{
		printf("program info: %s\n", Buffer);
	}

	clGetDeviceInfo (devices[0], CL_DEVICE_COMPILER_AVAILABLE,sizeof(ciErrNum), &ciErrNum, NULL);

	printf("Compiler available: %d\n", ciErrNum);

	ciErrNum = clBuildProgram(programs, 0, NULL, NULL, NULL, NULL);
        if (ciErrNum != CL_SUCCESS)
        {       
		printf("Error: Failed to build program executable! error code: %i\n", ciErrNum);

		//cl_build_status build_string;
		clGetProgramBuildInfo (programs,
                       devices[0],
                       CL_PROGRAM_BUILD_STATUS,
                       sizeof(build_string),
                       &build_string,
                       NULL);
		printf("build status: %d\n", build_string);

		clGetProgramBuildInfo (programs,
                       devices[0],
                       CL_PROGRAM_BUILD_LOG,
                       sizeof(Buffer),
                       &Buffer,
                       NULL);
                printf("build log: %s\n", Buffer);

		free_memory();
		clReleaseProgram (programs);
                exit(1);
        }

	clUnloadCompiler();

	kernels = clCreateKernel(programs, "AlphaKernel", &ciErrNum);
	if (!kernels || ciErrNum != CL_SUCCESS)
	{ 
		printf("Error: Failed to create compute kernel! %d\n", ciErrNum);
		free_memory();
		clReleaseProgram (programs);
		exit(1);
	}

	printf("program loaded, build, and kernel created\n");

	ciErrNum = clSetKernelArg (kernels, 0, sizeof(struct screen *), &cl_input1);
        if (ciErrNum != CL_SUCCESS)   
        {
                printf("Error: Failed to set kernel arg 0!\n");
                free_memory();
                clReleaseProgram (programs);
		clReleaseKernel (kernels);
                exit(1);
        }

	ciErrNum = clSetKernelArg (kernels, 1, sizeof(struct screen *), &cl_input2);
        if (ciErrNum != CL_SUCCESS)
        {
                printf("Error: Failed to set kernel arg 0!\n");
                free_memory();
                clReleaseProgram (programs);
                clReleaseKernel (kernels);  
                exit(1);
        }

	ciErrNum = clSetKernelArg (kernels, 2, sizeof(struct screen *), &cl_output);
        if (ciErrNum != CL_SUCCESS)
        {
                printf("Error: Failed to set kernel arg 0!\n");
                free_memory();
                clReleaseProgram (programs);
                clReleaseKernel (kernels);  
                exit(1);
        }

	printf("kernel args set\n");

	//neither global or local can be greater than card supports !

	/*
	1024 global work elements
	64 local workgroups
	16 elements each

	work-group ID 0-63
	local ID 0-15

	work-group-ID * 16 + local-ID
	[1024]

	<HuntsMan> the ND-range is a 1D, 2D or 3D space
	<HuntsMan> which is partitioned into "boxes", which are the local groups
	<HuntsMan> for example, a 1D ND-range, with global size 10 and local size 2, would generate
	<HuntsMan> 10 global work elements
	<HuntsMan> 5 local workgroups with 2 elements each
	<HuntsMan> global and local must be divisible in each dimension BTW
	*/

	/*
        for(i=0; i < 5; i++) {
		//for(j=0; j<2; j++) {
                	printf("elements:%d in1+in2:%d output:%d\n", i, input1[i].red+input2[i].red, output[i].red);
			//printf("element %d: %d\n", i, input2[i]);
		//}
        }

        for(i=1019; i < 1024; i++) {
		printf("elements:%d in1+in2:%d output:%d\n", i, input1[i].red+input2[i].red, output[i].red);
        }
	*/

        glutInitWindowSize( X, Y );
        glutCreateWindow( "lsrender test" );
                        
        OpenGLInit();
        glutReshapeFunc( ResizeWindow );
        glutDisplayFunc( Animate );

        //glutIgnoreKeyRepeat(ignoreRepeats); 
        //glutSpecialFunc(special);
        //glutKeyboardFunc (keyboard);
        createGLUTMenus();

        glutMainLoop(  );

        return(0);
}

