#include <alloca.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <OpenCL/opencl.h>

/* Number of per-thread OpenCL state slots kept in clStates. */
#define MAX_GPUDEVICES 2
/* Per-thread OpenCL state, indexed by thread id; cleared by opencl_thread_shutdown(). */
static _clState *clStates[MAX_GPUDEVICES];

/*
 * Kernel-argument helpers. Each one binds the next kernel argument and ORs
 * the clSetKernelArg() result into `status`. They expand in the caller's
 * scope and expect these locals to exist there:
 *   cl_kernel *kernel  - kernel whose arguments are being set
 *   unsigned   num     - running argument index, post-incremented per call
 *   cl_int     status  - accumulated result (non-zero once any call failed)
 *   blk                - (CL_SET_BLKARG only) pointer to a dev_blk_ctx
 * Macro arguments are parenthesised so that expressions such as `n + 1` or
 * `&buf[i]` expand with the intended precedence, and sizes use cl_uint (the
 * OpenCL host type) instead of the non-standard `uint`.
 */
#define CL_SET_BLKARG(blkvar) status |= clSetKernelArg(*kernel, num++, sizeof(cl_uint), (void *)&blk->blkvar)
#define CL_SET_ARG(var) status |= clSetKernelArg(*kernel, num++, sizeof(var), (void *)&(var))
#define CL_SET_VARG(args, var) status |= clSetKernelArg(*kernel, num++, (args) * sizeof(cl_uint), (void *)(var))
/* Marks a parameter or variable as intentionally unused (GCC/Clang attribute). */
#define __maybe_unused      __attribute__((unused))
/*
 * _GL_WARN_ON_USE(function, message): redeclare `function` so that the
 * compiler can warn with `message` when it is used. The definition chosen
 * depends on the compiler version; it is only provided here when no earlier
 * header has already defined it. Nothing in the visible part of this file
 * invokes the macro.
 */
#ifndef _GL_WARN_ON_USE

# if 4 < __GNUC__ || (__GNUC__ == 4 && 3 <= __GNUC_MINOR__)
/* A compiler attribute is available in gcc versions 4.3.0 and later.  */
#  define _GL_WARN_ON_USE(function, message) \
extern __typeof__ (function) function __attribute__ ((__warning__ (message)))
# elif __GNUC__ >= 3 && GNULIB_STRICT_CHECKING
/* Verify the existence of the function.  */
#  define _GL_WARN_ON_USE(function, message) \
extern __typeof__ (function) function
# else /* Unsupported.  */
/* Fallback: expands to a harmless declaration; _GL_WARN_EXTERN_C is not
 * defined in the visible part of this file and must come from elsewhere. */
#  define _GL_WARN_ON_USE(function, message) \
_GL_WARN_EXTERN_C int _gl_warn_on_use
# endif
#endif

/*
 * One unit of work handed to a device.
 *
 * Within the visible part of this file only three members are read, all by
 * queue_kernel(): the first 80 bytes of `data` are uploaded to the device,
 * `midstate` is passed as two 16-byte kernel arguments, and bytes 28..31 of
 * `device_target` are passed as a single cl_uint argument. The remaining
 * members are used by code outside this view.
 *
 * NOTE(review): `dev_blk_ctx` is used by value below but its typedef appears
 * later in this file, and `bool`, `struct timeval` and `UT_hash_handle` rely
 * on headers that are not included here - confirm the intended include order.
 */
struct work {
    unsigned char   data[128];
    unsigned char   midstate[32];
    unsigned char   target[32];
    unsigned char   hash[32];

    unsigned char   device_target[32];
    double      device_diff;
    uint64_t    share_diff;

    int     rolls;
    int     drv_rolllimit; /* How much the driver can roll ntime */

    /* Per-work device context; its `work` member points back at a struct work. */
    dev_blk_ctx blk;

    struct thr_info *thr;
    int     thr_id;
    struct pool *pool;
    struct timeval  tv_staged;

    bool        mined;
    bool        clone;
    bool        cloned;
    int     rolltime;
    bool        longpoll;
    bool        stale;
    bool        mandatory;
    bool        block;
    bool        stratum;
    char        *job_id;
    uint64_t    nonce2;
    size_t      nonce2_len;
    char        *ntime;
    double      sdiff;
    char        *nonce1;

    bool        gbt;
    char        *coinbase;
    int     gbt_txns;

    unsigned int    work_block;
    int     id;
    UT_hash_handle  hh;

    double      work_difficulty;

    // Allow devices to identify work if multiple sub-devices
    int     subid;
    // Allow devices to flag work for their own purposes
    bool        devflag;
    // Allow devices to timestamp work for their own purposes
    struct timeval  tv_stamp;

    struct timeval  tv_getwork;
    struct timeval  tv_getwork_reply;
    struct timeval  tv_cloned;
    struct timeval  tv_work_start;
    struct timeval  tv_work_found;
    char        getwork_mode;
};


/*
 * Per-work block of 32-bit values kept alongside a work item, plus a back
 * pointer to the owning work. In the visible part of this file only the
 * `work` member is dereferenced (by queue_kernel()); the CL_SET_BLKARG macro
 * can pass any of the cl_uint members as a kernel argument.
 * Member order is part of the layout and must not change.
 */
typedef struct {
    cl_uint ctx_a, ctx_b, ctx_c, ctx_d;
    cl_uint ctx_e, ctx_f, ctx_g, ctx_h;
    cl_uint cty_a, cty_b, cty_c, cty_d;
    cl_uint cty_e, cty_f, cty_g, cty_h;
    cl_uint merkle, ntime, nbits, nonce;
    cl_uint fW0, fW1, fW2, fW3, fW15;
    cl_uint fW01r, fcty_e, fcty_e2;
    cl_uint W16, W17, W2;
    cl_uint PreVal4, T1;
    cl_uint C1addK5, D1A, W2A, W17_2;
    cl_uint PreVal4addT1, T1substate0;
    cl_uint PreVal4_2;
    cl_uint PreVal0;
    cl_uint PreW18;
    cl_uint PreW19;
    cl_uint PreW31;
    cl_uint PreW32;

    /* FIXME: remove (For diakgcn) */
    cl_uint B1addK6, PreVal0addK7, W16addK16, W17addK17;
    cl_uint zeroA, zeroB;
    cl_uint oneA, twoA, threeA, fourA, fiveA, sixA, sevenA;

    /* Work item this context belongs to. */
    struct work *work;
} dev_blk_ctx;


/*
 * Per-thread bookkeeping record.
 *
 * The only member read in the visible part of this file is `id`, which
 * opencl_thread_shutdown() uses as the index into clStates. The other
 * members are used by code outside this view.
 *
 * NOTE(review): `pthread_t`, `cgsem_t`, `bool` and `struct timeval` depend on
 * headers that are not included in this file - confirm where they come from.
 */
struct thr_info {
    int     id;
    int     device_thread;

    pthread_t   pth;
    cgsem_t     sem;
    struct thread_q *q;
    struct cgpu_info *cgpu;
    void *cgpu_data;
    struct timeval last;
    struct timeval sick;

    bool    pause;
    bool    paused;
    bool    getwork;
    double  rolling;

    bool    work_restart;
    bool    work_update;
};

/*
 * OpenCL state for one device/thread, created by initCl() and torn down by
 * opencl_thread_shutdown(), which releases the CL handles and free()s the
 * structure itself.
 */
typedef struct {
    /* Handles created by initCl(). */
    cl_context context;
    cl_kernel kernel;
    cl_command_queue commandQueue;
    cl_program program;
    /* Write-only device buffer of BUFFERSIZE bytes. */
    cl_mem outputBuffer;
    /* Read-only 128-byte device buffer; queue_kernel() uploads 80 bytes of work data into it. */
    cl_mem CLbuffer0;
    /* Read/write device buffer of padbufsize bytes. */
    cl_mem padbuffer8;
    //cl_mem midstate;
    //cl_mem gbuff;
    //cl_mem lbuff;
    size_t padbufsize;
    /* Set by queue_kernel() to point at the current work's data; not owned by this struct. */
    void * cldata;
    /* Not referenced in the visible part of this file. */
    void * clmidstate;
    /* Capability flags read by initCl(); nothing in the visible code sets them,
     * so they keep the zero value from calloc() unless set elsewhere. */
    bool hasBitAlign;
    bool hasOpenCL11plus;
    bool hasOpenCL12plus;
    /* Always set to true by initCl(); adds "-D GOFFSET" to the build options. */
    bool goffset;
    /* Vector width; initCl() forces this to 1. */
    cl_uint vwidth;
    /* Value reported for CL_DEVICE_MAX_WORK_GROUP_SIZE. */
    size_t max_work_size;
    /* Work size passed to the kernel build as WORKSIZE. */
    size_t wsize;
    /* Reported compute units multiplied by 64. */
    size_t compute_shaders;
} _clState;


/*
 * Upload the current work data to the device and bind the kernel arguments.
 *
 * clState  OpenCL state whose command queue, buffers and kernel are used.
 * blk      Block context; blk->work supplies data, midstate and device_target.
 * threads  Unused.
 *
 * The first 80 bytes of the work data are written (blocking) into CLbuffer0,
 * then the kernel arguments are set in this order: CLbuffer0, outputBuffer,
 * padbuffer8, midstate[0..15], midstate[16..31], and the 32-bit value held in
 * bytes 28..31 of device_target.
 *
 * Returns CL_SUCCESS (0) when every call succeeded. On failure the result is
 * the bitwise OR of the individual status codes, so it is only meaningful as
 * "non-zero means something failed".
 */
static cl_int queue_kernel(_clState *clState,  dev_blk_ctx *blk, __maybe_unused cl_uint threads)
{
    unsigned char *midstate = blk->work->midstate;
    cl_kernel *kernel = &clState->kernel;
    unsigned int num = 0;
    cl_uint le_target;
    cl_int status = 0;

    /* device_target is a byte array: copy the last four bytes out instead of
     * dereferencing a cast pointer, which would break strict aliasing and
     * could be misaligned. The value is taken in host byte order, exactly as
     * the pointer cast did. */
    memcpy(&le_target, blk->work->device_target + 28, sizeof(le_target));

    clState->cldata = blk->work->data;
    status = clEnqueueWriteBuffer(clState->commandQueue, clState->CLbuffer0, CL_TRUE, 0, 80,
                                  clState->cldata, 0, NULL, NULL);

    /* The CL_SET_* macros use the locals `kernel`, `num` and `status`. */
    CL_SET_ARG(clState->CLbuffer0);
    CL_SET_ARG(clState->outputBuffer);
    CL_SET_ARG(clState->padbuffer8);
    CL_SET_VARG(4, &midstate[0]);
    CL_SET_VARG(4, &midstate[16]);
    CL_SET_ARG(le_target);

    return status;
}

static void opencl_thread_shutdown(struct thr_info *thr)
{
    const int thr_id = thr->id;
    _clState *clState = clStates[thr_id];
    clStates[thr_id] = NULL;

    if (clState) {
        clFinish(clState->commandQueue);
        clReleaseMemObject(clState->outputBuffer);
        clReleaseMemObject(clState->CLbuffer0);
        clReleaseMemObject(clState->padbuffer8);
        //clReleaseMemObject(clState->midstate);
        //clReleaseMemObject(clState->gbuff);
        //clReleaseMemObject(clState->lbuff);
        clReleaseKernel(clState->kernel);
        clReleaseProgram(clState->program);
        clReleaseCommandQueue(clState->commandQueue);
        clReleaseContext(clState->context);
        free(clState);
    }
}


_clState *initCl(unsigned int gpu, char *name, size_t nameSize, algorithm_t *algorithm)
{

	_clState *clState = (_clState *)calloc(1, sizeof(_clState));
	struct cgpu_info *cgpu = &gpus[gpu];
	cl_platform_id platform = NULL;
    char pbuff[256], vbuff[255];
    cl_platform_id* platforms;
    cl_uint preferred_vwidth;
    cl_device_id *devices;
    cl_uint numPlatforms;
    cl_uint numDevices;
    cl_int status;
    status = clGetPlatformIDs(0, NULL, &numPlatforms);
    if (status != CL_SUCCESS) {
        applog(LOG_ERR, "Error %d: Getting Platforms. (clGetPlatformsIDs)", status);
        return NULL;
    }

    platforms = (cl_platform_id *)alloca(numPlatforms*sizeof(cl_platform_id));
    status = clGetPlatformIDs(numPlatforms, platforms, NULL);
    if (status != CL_SUCCESS) {
        applog(LOG_ERR, "Error %d: Getting Platform Ids. (clGetPlatformsIDs)", status);
        return NULL;
    }

    if (opt_platform_id >= (int)numPlatforms) {
        applog(LOG_ERR, "Specified platform that does not exist");
        return NULL;
    }

    status = clGetPlatformInfo(platforms[opt_platform_id], CL_PLATFORM_VENDOR, sizeof(pbuff), pbuff, NULL);
    if (status != CL_SUCCESS) {
        applog(LOG_ERR, "Error %d: Getting Platform Info. (clGetPlatformInfo)", status);
        return NULL;
    }
    platform = platforms[opt_platform_id];

    if (platform == NULL) {
        perror("NULL platform found!\n");
        return NULL;
    }
    applog(LOG_INFO, "CL Platform vendor: %s", pbuff);
    status = clGetPlatformInfo(platform, CL_PLATFORM_NAME, sizeof(pbuff), pbuff, NULL);
    if (status == CL_SUCCESS)
        applog(LOG_INFO, "CL Platform name: %s", pbuff);
    status = clGetPlatformInfo(platform, CL_PLATFORM_VERSION, sizeof(vbuff), vbuff, NULL);
    if (status == CL_SUCCESS)
        applog(LOG_INFO, "CL Platform version: %s", vbuff);

    status = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 0, NULL, &numDevices);
    if (status != CL_SUCCESS) {
        applog(LOG_ERR, "Error %d: Getting Device IDs (num)", status);
        return NULL;
    }

    if (numDevices > 0 ) {
        devices = (cl_device_id *)malloc(numDevices*sizeof(cl_device_id));

        /* Now, get the device list data */

        status = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, numDevices, devices, NULL);
        if (status != CL_SUCCESS) {
            applog(LOG_ERR, "Error %d: Getting Device IDs (list)", status);
            return NULL;
        }

        applog(LOG_INFO, "List of devices:");

        unsigned int i;
        for (i = 0; i < numDevices; i++) {
            status = clGetDeviceInfo(devices[i], CL_DEVICE_NAME, sizeof(pbuff), pbuff, NULL);
            if (status != CL_SUCCESS) {
                applog(LOG_ERR, "Error %d: Getting Device Info", status);
                return NULL;
            }
            applog(LOG_INFO, "\t%i\t%s", i, pbuff);
        }

        if (gpu < numDevices) {
            status = clGetDeviceInfo(devices[gpu], CL_DEVICE_NAME, sizeof(pbuff), pbuff, NULL);
            if (status != CL_SUCCESS) {
                applog(LOG_ERR, "Error %d: Getting Device Info", status);
                return NULL;
            }

            applog(LOG_INFO, "Selected %i: %s", gpu, pbuff);
            strncpy(name, pbuff, nameSize);
        } else {
            applog(LOG_ERR, "Invalid GPU %i", gpu);
            return NULL;
        }

    } else return NULL;

    cl_context_properties cps[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties)platform, 0 };

    clState->context = clCreateContextFromType(cps, CL_DEVICE_TYPE_GPU, NULL, NULL, &status);
    if (status != CL_SUCCESS) {
        applog(LOG_ERR, "Error %d: Creating Context. (clCreateContextFromType)", status);
        return NULL;
    }

    /////////////////////////////////////////////////////////////////
    // Create an OpenCL command queue
    /////////////////////////////////////////////////////////////////
    clState->commandQueue = clCreateCommandQueue(clState->context, devices[gpu],
                             CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &status);
    if (status != CL_SUCCESS) /* Try again without OOE enable */
        clState->commandQueue = clCreateCommandQueue(clState->context, devices[gpu], 0 , &status);
    if (status != CL_SUCCESS) {
        applog(LOG_ERR, "Error %d: Creating Command Queue. (clCreateCommandQueue)", status);
        return NULL;
    }

    status = clGetDeviceInfo(devices[gpu], CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT, sizeof(cl_uint), (void *)&preferred_vwidth, NULL);
    if (status != CL_SUCCESS) {
        applog(LOG_ERR, "Error %d: Failed to clGetDeviceInfo when trying to get CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT", status);
        return NULL;
    }
    applog(LOG_DEBUG, "Preferred vector width reported %d", preferred_vwidth);

    status = clGetDeviceInfo(devices[gpu], CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t), (void *)&clState->max_work_size, NULL);
    if (status != CL_SUCCESS) {
        applog(LOG_ERR, "Error %d: Failed to clGetDeviceInfo when trying to get CL_DEVICE_MAX_WORK_GROUP_SIZE", status);
        return NULL;
    }
    applog(LOG_DEBUG, "Max work group size reported %d", (int)(clState->max_work_size));

    size_t compute_units = 0;
    status = clGetDeviceInfo(devices[gpu], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(size_t), (void *)&compute_units, NULL);
    if (status != CL_SUCCESS) {
        applog(LOG_ERR, "Error %d: Failed to clGetDeviceInfo when trying to get CL_DEVICE_MAX_COMPUTE_UNITS", status);
        return NULL;
    }
    clState->compute_shaders = compute_units * 64;
    applog(LOG_DEBUG, "Max shaders calculated %d", (int)(clState->compute_shaders));

    status = clGetDeviceInfo(devices[gpu], CL_DEVICE_MAX_MEM_ALLOC_SIZE , sizeof(cl_ulong), (void *)&cgpu->max_alloc, NULL);
    if (status != CL_SUCCESS) {
        applog(LOG_ERR, "Error %d: Failed to clGetDeviceInfo when trying to get CL_DEVICE_MAX_MEM_ALLOC_SIZE", status);
        return NULL;
    }
    applog(LOG_DEBUG, "Max mem alloc size is %lu", (long unsigned int)(cgpu->max_alloc));

    char binaryfilename[255];
    char filename[255];
    char strbuf[32];

    if (cgpu->kernelname == NULL) {
        applog(LOG_INFO, "No kernel specified, defaulting to ckolivas");
        cgpu->kernelname = strdup("ckolivas");
    }

    sprintf(strbuf, "%s.cl", cgpu->kernelname);
    strcpy(filename, strbuf);
    strcpy(binaryfilename, cgpu->kernelname);
    /* All available kernels only support vector 1 */
    cgpu->vwidth = 1;

    /* Vectors are hard-set to 1 above. */
    if (likely(cgpu->vwidth))
        clState->vwidth = cgpu->vwidth;
    else {
        clState->vwidth = preferred_vwidth;
        cgpu->vwidth = preferred_vwidth;
    }

    clState->goffset = true;

    if (cgpu->work_size && cgpu->work_size <= clState->max_work_size)
        clState->wsize = cgpu->work_size;
    else
        clState->wsize = 256;

    if (!cgpu->opt_lg) {
        applog(LOG_DEBUG, "GPU %d: selecting lookup gap of 2", gpu);
        cgpu->lookup_gap = 2;
    } else
        cgpu->lookup_gap = cgpu->opt_lg;
    if ((strcmp(cgpu->kernelname, "zuikkis") == 0) && (cgpu->lookup_gap != 2)) {
        applog(LOG_WARNING, "Kernel zuikkis only supports lookup-gap = 2 (currently %d), forcing.", cgpu->lookup_gap);
        cgpu->lookup_gap = 2;
    }

    if ((strcmp(cgpu->kernelname, "lsoc") == 0) && (cgpu->lookup_gap > 8)) {
        applog(LOG_WARNING, "Kernel lsoc only supports lookup-gap 1 to 8 (currently %d), forcing 8.", cgpu->lookup_gap);
        cgpu->lookup_gap = 8;
    }

    if (!cgpu->opt_tc) {
        unsigned int sixtyfours;

        sixtyfours =  cgpu->max_alloc / 131072 / 64 / (algorithm->n/1024) - 1;
        cgpu->thread_concurrency = sixtyfours * 64;
        if (cgpu->shaders && cgpu->thread_concurrency > cgpu->shaders) {
            cgpu->thread_concurrency -= cgpu->thread_concurrency % cgpu->shaders;
            if (cgpu->thread_concurrency > cgpu->shaders * 5)
                cgpu->thread_concurrency = cgpu->shaders * 5;
        }
        applog(LOG_DEBUG, "GPU %d: selecting thread concurrency of %d", gpu, (int)(cgpu->thread_concurrency));
    } else
        cgpu->thread_concurrency = cgpu->opt_tc;


    FILE *binaryfile;
    size_t *binary_sizes;
    char **binaries;
    int pl;
    char *source = file_contents(filename, &pl);
    size_t sourceSize[] = {(size_t)pl};
    cl_uint slot, cpnd;

    slot = cpnd = 0;
    if (!source)
        return NULL;

    binary_sizes = (size_t *)calloc(sizeof(size_t) * MAX_GPUDEVICES * 4, 1);
    if (unlikely(!binary_sizes)) {
        applog(LOG_ERR, "Unable to calloc binary_sizes");
        return NULL;
    }
    binaries = (char **)calloc(sizeof(char *) * MAX_GPUDEVICES * 4, 1);
    if (unlikely(!binaries)) {
        applog(LOG_ERR, "Unable to calloc binaries");
        return NULL;
    }

    strcat(binaryfilename, name);
    if (clState->goffset)
        strcat(binaryfilename, "g");

    sprintf(strbuf, "lg%utc%unf%u", cgpu->lookup_gap, (unsigned int)cgpu->thread_concurrency, algorithm->nfactor);
    strcat(binaryfilename, strbuf);

    sprintf(strbuf, "w%d", (int)clState->wsize);
    strcat(binaryfilename, strbuf);
    sprintf(strbuf, "l%d", (int)sizeof(long));
    strcat(binaryfilename, strbuf);
    strcat(binaryfilename, ".bin");

    binaryfile = fopen(binaryfilename, "rb");
    if (!binaryfile) {
        applog(LOG_DEBUG, "No binary found, generating from source");
    } else {
        struct stat binary_stat;
        if (unlikely(stat(binaryfilename, &binary_stat))) {
            applog(LOG_DEBUG, "Unable to stat binary, generating from source");
            fclose(binaryfile);
            goto build;
        }
        if (!binary_stat.st_size)
            goto build;

        binary_sizes[slot] = binary_stat.st_size;
        binaries[slot] = (char *)calloc(binary_sizes[slot], 1);
        if (unlikely(!binaries[slot])) {
            applog(LOG_ERR, "Unable to calloc binaries");
            fclose(binaryfile);
            return NULL;
        }

        if (fread(binaries[slot], 1, binary_sizes[slot], binaryfile) != binary_sizes[slot]) {
            applog(LOG_ERR, "Unable to fread binaries");
            fclose(binaryfile);
            free(binaries[slot]);
            goto build;
        }

        clState->program = clCreateProgramWithBinary(clState->context, 1, &devices[gpu], &binary_sizes[slot], (const unsigned char **)binaries, NULL, &status);
        if (status != CL_SUCCESS) {
            applog(LOG_ERR, "Error %d: Loading Binary into cl_program (clCreateProgramWithBinary)", status);
            fclose(binaryfile);
            free(binaries[slot]);
            goto build;
        }

        fclose(binaryfile);
        applog(LOG_DEBUG, "Loaded binary image %s", binaryfilename);

        goto built;
    }
    /////////////////////////////////////////////////////////////////
    // Load CL file, build CL program object, create CL kernel object
    /////////////////////////////////////////////////////////////////

build:
    applog(LOG_NOTICE, "Building binary %s", binaryfilename);

    clState->program = clCreateProgramWithSource(clState->context, 1, (const char **)&source, sourceSize, &status);
    if (status != CL_SUCCESS) {
        applog(LOG_ERR, "Error %d: Loading Binary into cl_program (clCreateProgramWithSource)", status);
        return NULL;
    }

    /* create a cl program executable for all the devices specified */
    char *CompilerOptions = (char *)calloc(1, 256);

    sprintf(CompilerOptions, "-I kernel/ -D LOOKUP_GAP=%d -D CONCURRENT_THREADS=%d -D WORKSIZE=%d -D NFACTOR=%d",
            cgpu->lookup_gap, (unsigned int)cgpu->thread_concurrency, (int)clState->wsize, (unsigned int)algorithm->nfactor);

    applog(LOG_DEBUG, "Setting worksize to %d", (int)(clState->wsize));
    if (clState->vwidth > 1)
        applog(LOG_DEBUG, "Patched source to suit %d vectors", clState->vwidth);

    if (clState->hasBitAlign) {
        strcat(CompilerOptions, " -D BITALIGN");
        applog(LOG_DEBUG, "cl_amd_media_ops found, setting BITALIGN");
        if (!clState->hasOpenCL12plus &&
            (strstr(name, "Cedar") ||
             strstr(name, "Redwood") ||
             strstr(name, "Juniper") ||
             strstr(name, "Cypress" ) ||
             strstr(name, "Hemlock" ) ||
             strstr(name, "Caicos" ) ||
             strstr(name, "Turks" ) ||
             strstr(name, "Barts" ) ||
             strstr(name, "Cayman" ) ||
             strstr(name, "Antilles" ) ||
             strstr(name, "Wrestler" ) ||
             strstr(name, "Zacate" ) ||
             strstr(name, "WinterPark" )))
            patchbfi = true;
    } else
        applog(LOG_DEBUG, "cl_amd_media_ops not found, will not set BITALIGN");

    if (patchbfi) {
        strcat(CompilerOptions, " -D BFI_INT");
        applog(LOG_DEBUG, "BFI_INT patch requiring device found, patched source with BFI_INT");
    } else
        applog(LOG_DEBUG, "BFI_INT patch requiring device not found, will not BFI_INT patch");

    if (clState->goffset)
        strcat(CompilerOptions, " -D GOFFSET");

    if (!clState->hasOpenCL11plus)
        strcat(CompilerOptions, " -D OCL1");

    applog(LOG_DEBUG, "CompilerOptions: %s", CompilerOptions);
    status = clBuildProgram(clState->program, 1, &devices[gpu], CompilerOptions , NULL, NULL);
    free(CompilerOptions);

    if (status != CL_SUCCESS) {
        applog(LOG_ERR, "Error %d: Building Program (clBuildProgram)", status);
        size_t logSize;
        status = clGetProgramBuildInfo(clState->program, devices[gpu], CL_PROGRAM_BUILD_LOG, 0, NULL, &logSize);
        char *log = (char *)malloc(logSize);
        status = clGetProgramBuildInfo(clState->program, devices[gpu], CL_PROGRAM_BUILD_LOG, logSize, log, NULL);
        applog(LOG_ERR, "%s", log);
        return NULL;
    }

    prog_built = true;

#ifdef __APPLE__
    /* OSX OpenCL breaks reading off binaries with >1 GPU so always build
     * from source. */
    goto built;
#endif

    status = clGetProgramInfo(clState->program, CL_PROGRAM_NUM_DEVICES, sizeof(cl_uint), &cpnd, NULL);
    if (unlikely(status != CL_SUCCESS)) {
        applog(LOG_ERR, "Error %d: Getting program info CL_PROGRAM_NUM_DEVICES. (clGetProgramInfo)", status);
        return NULL;
    }

    status = clGetProgramInfo(clState->program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t)*cpnd, binary_sizes, NULL);
    if (unlikely(status != CL_SUCCESS)) {
        applog(LOG_ERR, "Error %d: Getting program info CL_PROGRAM_BINARY_SIZES. (clGetProgramInfo)", status);
        return NULL;
    }

    /* The actual compiled binary ends up in a RANDOM slot! Grr, so we have
     * to iterate over all the binary slots and find where the real program
     * is. What the heck is this!? */
    for (slot = 0; slot < cpnd; slot++)
        if (binary_sizes[slot])
            break;

    /* copy over all of the generated binaries. */
    applog(LOG_DEBUG, "Binary size for gpu %d found in binary slot %d: %d", gpu, slot, (int)(binary_sizes[slot]));
    if (!binary_sizes[slot]) {
        applog(LOG_ERR, "OpenCL compiler generated a zero sized binary, FAIL!");
        return NULL;
    }
    binaries[slot] = (char *)calloc(sizeof(char)* binary_sizes[slot], 1);
    status = clGetProgramInfo(clState->program, CL_PROGRAM_BINARIES, sizeof(char *) * cpnd, binaries, NULL );
    if (unlikely(status != CL_SUCCESS)) {
        applog(LOG_ERR, "Error %d: Getting program info. CL_PROGRAM_BINARIES (clGetProgramInfo)", status);
        return NULL;
    }

    /* Patch the kernel if the hardware supports BFI_INT but it needs to
     * be hacked in */
    if (patchbfi) {
        unsigned remaining = binary_sizes[slot];
        char *w = binaries[slot];
        unsigned int start, length;

        /* Find 2nd incidence of .text, and copy the program's
        * position and length at a fixed offset from that. Then go
        * back and find the 2nd incidence of \x7ELF (rewind by one
        * from ELF) and then patch the opcocdes */
        if (!advance(&w, &remaining, ".text"))
            goto build;
        w++; remaining--;
        if (!advance(&w, &remaining, ".text")) {
            /* 32 bit builds only one ELF */
            w--; remaining++;
        }
        memcpy(&start, w + 285, 4);
        memcpy(&length, w + 289, 4);
        w = binaries[slot]; remaining = binary_sizes[slot];
        if (!advance(&w, &remaining, "ELF"))
            goto build;
        w++; remaining--;
        if (!advance(&w, &remaining, "ELF")) {
            /* 32 bit builds only one ELF */
            w--; remaining++;
        }
        w--; remaining++;
        w += start; remaining -= start;
        applog(LOG_DEBUG, "At %p (%u rem. bytes), to begin patching",
            w, remaining);
        patch_opcodes(w, length);

        status = clReleaseProgram(clState->program);
        if (status != CL_SUCCESS) {
            applog(LOG_ERR, "Error %d: Releasing program. (clReleaseProgram)", status);
            return NULL;
        }

        clState->program = clCreateProgramWithBinary(clState->context, 1, &devices[gpu], &binary_sizes[slot], (const unsigned char **)&binaries[slot], NULL, &status);
        if (status != CL_SUCCESS) {
            applog(LOG_ERR, "Error %d: Loading Binary into cl_program (clCreateProgramWithBinary)", status);
            return NULL;
        }

        /* Program needs to be rebuilt */
        prog_built = false;
    }

    free(source);

    /* Save the binary to be loaded next time */
    binaryfile = fopen(binaryfilename, "wb");
    if (!binaryfile) {
        /* Not fatal, just means we build it again next time */
        applog(LOG_DEBUG, "Unable to create file %s", binaryfilename);
    } else {
        if (unlikely(fwrite(binaries[slot], 1, binary_sizes[slot], binaryfile) != binary_sizes[slot])) {
            applog(LOG_ERR, "Unable to fwrite to binaryfile");
            return NULL;
        }
        fclose(binaryfile);
    }
built:
    if (binaries[slot])
        free(binaries[slot]);
    free(binaries);
    free(binary_sizes);

    applog(LOG_NOTICE, "Initialising kernel %s with%s bitalign, %spatched BFI, nfactor %d, n %d",
           filename, clState->hasBitAlign ? "" : "out", patchbfi ? "" : "un",
           algorithm->nfactor, algorithm->n);

    if (!prog_built) {
        /* create a cl program executable for all the devices specified */
        status = clBuildProgram(clState->program, 1, &devices[gpu], NULL, NULL, NULL);
        if (status != CL_SUCCESS) {
            applog(LOG_ERR, "Error %d: Building Program (clBuildProgram)", status);
            size_t logSize;
            status = clGetProgramBuildInfo(clState->program, devices[gpu], CL_PROGRAM_BUILD_LOG, 0, NULL, &logSize);

            char *log = (char *)malloc(logSize);
            status = clGetProgramBuildInfo(clState->program, devices[gpu], CL_PROGRAM_BUILD_LOG, logSize, log, NULL);
            applog(LOG_ERR, "%s", log);
            return NULL;
        }
    }

    /* get a kernel object handle for a kernel with the given name */
    clState->kernel = clCreateKernel(clState->program, "search", &status);
    if (status != CL_SUCCESS) {
        applog(LOG_ERR, "Error %d: Creating Kernel from program. (clCreateKernel)", status);
        return NULL;
    }
    size_t ipt = (algorithm->n / cgpu->lookup_gap +
              (algorithm->n % cgpu->lookup_gap > 0));
    size_t bufsize = 128 * ipt * cgpu->thread_concurrency;
    /* Use the max alloc value which has been rounded to a power of
     * 2 greater >= required amount earlier */
    if (bufsize > cgpu->max_alloc) {
        applog(LOG_WARNING, "Maximum buffer memory device %d supports says %lu",
               gpu, (unsigned long)(cgpu->max_alloc));
        applog(LOG_WARNING, "Your scrypt settings come to %lu", (unsigned long)bufsize);
    }
    applog(LOG_DEBUG, "Creating scrypt buffer sized %lu", (unsigned long)bufsize);
    clState->padbufsize = bufsize;

    /* This buffer is weird and might work to some degree even if
     * the create buffer call has apparently failed, so check if we
     * get anything back before we call it a failure. */
    clState->padbuffer8 = NULL;
    clState->padbuffer8 = clCreateBuffer(clState->context, CL_MEM_READ_WRITE, bufsize, NULL, &status);
    if (status != CL_SUCCESS && !clState->padbuffer8) {
        applog(LOG_ERR, "Error %d: clCreateBuffer (padbuffer8), decrease TC or increase LG", status);
        return NULL;
    }

    clState->CLbuffer0 = clCreateBuffer(clState->context, CL_MEM_READ_ONLY, 128, NULL, &status);
    if (status != CL_SUCCESS) {
        applog(LOG_ERR, "Error %d: clCreateBuffer (CLbuffer0)", status);
        return NULL;
    }
    clState->outputBuffer = clCreateBuffer(clState->context, CL_MEM_WRITE_ONLY, BUFFERSIZE, NULL, &status);
    if (status != CL_SUCCESS) {
        applog(LOG_ERR, "Error %d: clCreateBuffer (outputBuffer)", status);
        return NULL;
    }

    return clState;
}


/* Program entry point: does no work and reports success to the caller. */
int main(int argc, char **argv)
{
    /* Command-line arguments are accepted but not used. */
    (void)argc;
    (void)argv;

    return EXIT_SUCCESS;
}