/* Copyright (c) 2010 - 2021 Advanced Micro Devices, Inc.

 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
 in the Software without restriction, including without limitation the rights
 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 copies of the Software, and to permit persons to whom the Software is
 furnished to do so, subject to the following conditions:

 The above copyright notice and this permission notice shall be included in
 all copies or substantial portions of the Software.

 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE. */

#include "OCLPerfSepia.h"

#include <assert.h>
#include <stdio.h>
#include <string.h>
#include <time.h>

#define WIDTH 1024
#define HEIGHT 1024

// Quiet pesky warnings
#ifdef WIN_OS
#define SNPRINTF sprintf_s
#else
#define SNPRINTF snprintf
#endif

#define MAX(a, b) (a > b ? a : b)

const char *sepiaVertexProgram =
    "!!ARBvp1.0\n"
    "\n"
    "\n"
    "OPTION ARB_position_invariant;\n"
    "\n"
    "PARAM p0 = program.local[2];\n"
    "PARAM p1 = program.local[3];\n"
    "ATTRIB a0 = vertex.texcoord[0];\n"
    "OUTPUT o0 = result.texcoord[0];\n"
    "OUTPUT o1 = result.texcoord[1];\n"
    "TEMP r0, r1;\n"
    "\n"
    "MOV o0, a0;\n"
    "#SWZ r1, a0, x, y, 0, 0;\n"
    "#DPH r0.x, r1, p0;\n"
    "#DPH r0.y, r1, p1;\n"
    "#MOV o1, r0;\n"
    "MOV o1, a0;\n"
    "\n"
    "END\n";

const char *sepiaFragmentProgram =
    "!!ARBfp1.0\n"
    "\n"
    "\n"
    "PARAM p0 = {1e-4, 0.085, 0.0, 0.0};\n"
    "PARAM p1 = {0.2125, 0.7154, 0.0721, 0.0};\n"
    "PARAM p2 = {-3605.984, 0.1323156, 0.0, -0.1991615};\n"
    "PARAM p3 = {708.7939, -0.3903106, -0.05854013, 0.6621023};\n"
    "PARAM p4 = {-50.93341, 0.4654831, 1.027555, -0.9069088};\n"
    "PARAM p5 = {3.116672, 0.7926372, 0.03219686, 1.411847};\n"
    "PARAM p6 = {8.95663e-4, -0.001104567, -6.0827e-4, 0.03277428};\n"
    "PARAM p7 = program.local[0];\n"
    "PARAM p8 = program.local[1];\n"
    "ATTRIB a0 = fragment.texcoord[1];\n"
    "OUTPUT o0 = result.color;\n"
    "TEMP r0, r1, r2, r3;\n"
    "\n"
    "TEX r1, a0, texture[0], RECT;\n"
    "#MAX r0, p0.x, r1.w;\n"
    "#RCP r2, r0.x;\n"
    "#DP3 r3, r1, p1;\n"
    "#MUL r0, r3, r2;\n"
    "#MAD r2, r0, p2, p3;\n"
    "#MAD r2, r2, r0, p4;\n"
    "#MAD r0, r2, r0, p5;\n"
    "#MUL r2, r1.w, p6;\n"
    "#MAD r2, r0, r3, r2;\n"
    "#MAD r0, r1.w, p0.y, -r3;\n"
    "#CMP r2.x, -r0, r2.x, r2.w;\n"
    "#MAD r0, r3, r3, -r3;\n"
    "#CMP r0, r0.x, r2, r3;\n"
    "#MOV r0.w, r1;\n"
    "#MUL r0, r0, p7;\n"
    "#LRP o0, p8.x, r0, r1;\n"
    "MOV o0, r1;\n"
    "\n"
    "END\n";

const static char *strKernel =
    "\n"
    "__kernel void program(write_only image2d_t dest, int flipped, int4 dim, "
    "float2 st_origin, float4 st_delta, float4 l0, float4 l1, float4 l2, "
    "float4 l3, read_only image2d_t t0, sampler_t t_sampler0)\n"
    "{\n"
    "      const sampler_t sam = CLK_NORMALIZED_COORDS_FALSE | "
    "CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;\n"
    "//    const float4 p0  = (float4)( 0x1.b33334p-3, 0x1.6e48e8p-1, "
    "0x1.275254p-4, 0x0p+0 );\n"
    "//    const float4 p1  = (float4)( 0x1.a36e2ep-14, 0x1.5c28f6p-4, 0x0p+0, "
    "0x0p+0 );\n"
    "//    const float4 p2  = (float4)( 0x1.d595dap-11, -0x1.218e3cp-10, "
    "-0x1.3ee89ep-11, 0x1.0c7ca6p-5 );\n"
    "//    const float4 p3  = (float4)( -0x1.c2bf7cp+11, 0x1.0efb7cp-3, "
    "0x0p+0, -0x1.97e1fcp-3 );\n"
    "//    const float4 p4  = (float4)( 0x1.62659ep+9, -0x1.8fad94p-2, "
    "-0x1.df8f8cp-5, 0x1.52ff12p-1 );\n"
    "//   const float4 p5  = (float4)( -0x1.9777ap+5, 0x1.dca79ap-2, "
    "0x1.070dd8p+0, -0x1.d0565ap-1 );\n"
    "//    const float4 p6  = (float4)( 0x1.8eef1cp+1, 0x1.95d48cp-1, "
    "0x1.07c1b6p-5, 0x1.696ecep+0 );\n"
    "//    int          dest_width = dim.x;\n"
    "//    int          dest_height = dim.y;\n"
    "    float4       o0, r0, r1, r2, r3, r4;\n"
    "//    float4       false_vector = (float4) 0.0f;\n"
    "//    float4       true_vector = (float4) 1.0f;\n"
    "    int2         loc = (int2)( get_global_id(0), get_global_id(1) );\n"
    "//    if ((loc.x >= dim.x) || loc.y >= dim.y) return;\n"
    "//    float4 f0 = (float4)( st_origin.x + ((float)loc.x + 0.5f) * "
    "st_delta.x + ((float)loc.y + 0.5f) * st_delta.z, st_origin.y + "
    "((float)loc.x + 0.5f) * st_delta.y + ((float)loc.y + 0.5f) * st_delta.w, "
    "0.0f, 0.0f );\n"
    "//    r2 = f0;\n"
    "//    r0.x = dot(r2.xy,l2.xy) + l2.w;\n"
    "//    r0.y = dot(r2.xy,l3.xy) + l3.w;\n"
    "//    r4 = r0;\n"
    "    r1 = read_imagef(t0, sam/*t_sampler0*/, r4.xy);\n"
    "//    r3 = dot(r1.xyz,p0.xyz);\n"
    "//    r2 = max(p1.xxxx, r1.wwww);\n"
    "//    r0 = native_recip(r2.xxxx);\n"
    "//    r4 = r3*r0;\n"
    "//    r2 = r1.wwww*p2;\n"
    "//    r0 = mad(r4,p3,p4);\n"
    "//    r0 = mad(r0,r4,p5);\n"
    "//    r0 = mad(r0,r4,p6);\n"
    "//    r2 = mad(r0,r3,r2);\n"
    "//    r0 = mad(r1.wwww,p1.yyyy,-r3);\n"
    "//    r2.x = select(r2.w,r2.x, isless(-r0.x, 0.0f));\n"
    "//    r0 = mad(r3,r3,-r3);\n"
    "//    r0 = select(r3,r2, isless(r0.xxxx, 0.0f));\n"
    "//    r0.w = r1.w;\n"
    "//    r0 = r0*l0;\n"
    "//    r0 = mix(r1,r0, l1.xxxx);\n"
    "//    r0.xyz = min(r0.xyz, r0.www);\n"
    "//    o0 = r0;\n"
    "    write_imagef(dest, loc /*(int2)( loc.x + dim.z , flipped ? "
    "get_image_height(dest) - (loc.y + dim.w + 1) : loc.y + dim.w )*/, r1 "
    "/*o0*/);\n"
    "}\n";

OCLPerfSepia::OCLPerfSepia() { _numSubTests = 2; }

OCLPerfSepia::~OCLPerfSepia() {}

void OCLPerfSepia::open(unsigned int test, char *units, double &conversion,
                        unsigned int deviceId) {
  bVerify_ = false;
  silentFailure_ = false;
  iterations_ = 50000;
  bpr_ = 0;
  data_ = 0;
  result_ = 0;
  width_ = 0;
  height_ = 0;
  _crcword = 0;
  conversion = 1.0f;
  _deviceId = deviceId;
  _openTest = test;
  texId = 0;
  format_.image_channel_order = CL_RGBA;
  format_.image_channel_data_type = CL_UNORM_INT8;

  srand(0x8956);  // some constant instead of time() so that we get same random
                  // numbers

  if (!IsGLEnabled(test, units, conversion, deviceId)) {
    silentFailure_ = true;
    return;
  }
  OCLGLCommon::open(test, units, conversion, deviceId);
  if (_errorFlag) return;
  if (test == 0) {
    // Build the kernel
    program_ = _wrapper->clCreateProgramWithSource(context_, 1, &strKernel,
                                                   NULL, &error_);
    CHECK_RESULT((error_ != CL_SUCCESS),
                 "clCreateProgramWithSource()  failed (%d)", error_);
    const char *optionsGPU = "-cl-denorms-are-zero -cl-mad-enable";
    error_ = _wrapper->clBuildProgram(program_, 1, &devices_[deviceId],
                                      optionsGPU, NULL, NULL);
    if (error_ != CL_SUCCESS) {
      char programLog[1024];
      _wrapper->clGetProgramBuildInfo(program_, devices_[deviceId],
                                      CL_PROGRAM_BUILD_LOG, 1024, programLog,
                                      0);
      printf("\n%s\n", programLog);
      fflush(stdout);
    }
    CHECK_RESULT((error_ != CL_SUCCESS), "clBuildProgram() failed (%d)",
                 error_);

    kernel_ = _wrapper->clCreateKernel(program_, "program", &error_);
    CHECK_RESULT((error_ != CL_SUCCESS), "clCreateKernel() failed (%d)",
                 error_);
  }
}

void OCLPerfSepia::populateData(void) {
  width_ = WIDTH;
  height_ = HEIGHT;
  bpr_ = 4 * width_;
  data_ = (cl_uchar *)malloc(height_ * bpr_);
  for (unsigned int n = 0; n < (height_ * bpr_); n++) {
    data_[n] = (n & 3) ? (rand() % 256) : 0xFF;
  }
}

void OCLPerfSepia::runGL(void) {
  glDisable(GL_ALPHA_TEST);
  glDisable(GL_DEPTH_TEST);
  glDisable(GL_SCISSOR_TEST);
  glDisable(GL_BLEND);
  glBlendFunc(GL_ONE, GL_ONE_MINUS_SRC_ALPHA);
  glDisable(GL_DITHER);
  glDisable(GL_CULL_FACE);
  glColorMask(GL_TRUE, GL_TRUE, GL_TRUE, GL_TRUE);
  glDepthMask(GL_FALSE);
  glStencilMask(0);

  glTexEnvi(GL_TEXTURE_ENV, GL_TEXTURE_ENV_MODE, GL_REPLACE);

  // let's create the textures we need

  glEnable(GL_TEXTURE_RECTANGLE_EXT);
  glGenTextures(1, &texId);
  glBindTexture(GL_TEXTURE_RECTANGLE_EXT, texId);

  // have GL alloc memory for us for our destination texture which we will be
  // rendering into
  glTexImage2D(GL_TEXTURE_RECTANGLE_EXT, 0, GL_RGBA, width_, height_, 0,
               GL_BGRA /*RGBA*/, GL_UNSIGNED_INT_8_8_8_8_REV, NULL);
  glTexParameteri(GL_TEXTURE_RECTANGLE_ARB, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
  glTexParameteri(GL_TEXTURE_RECTANGLE_ARB, GL_TEXTURE_MAG_FILTER, GL_NEAREST);

  // for the source texture we will provide a data ptr and hang on to it
  GLuint srcTexture;

  glGenTextures(1, &srcTexture);
  glBindTexture(GL_TEXTURE_RECTANGLE_EXT, srcTexture);

  glPixelStorei(GL_UNPACK_ROW_LENGTH, width_);
  glPixelStorei(GL_UNPACK_IMAGE_HEIGHT, height_);
  glPixelStorei(GL_UNPACK_ALIGNMENT, 8);

  // XXX Alex -- use optimal texture upload format.
  glTexImage2D(GL_TEXTURE_RECTANGLE_EXT, 0, GL_RGBA, width_, height_, 0,
               GL_BGRA, /* GL_RGBA,*/
               format_.image_channel_order == CL_RGBA
                   ? GL_UNSIGNED_INT_8_8_8_8
                   : GL_UNSIGNED_INT_8_8_8_8_REV,
               data_);

  glTexParameteri(GL_TEXTURE_RECTANGLE_ARB, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
  glTexParameteri(GL_TEXTURE_RECTANGLE_ARB, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
  glTexParameteri(GL_TEXTURE_RECTANGLE_ARB, GL_TEXTURE_WRAP_S,
                  GL_CLAMP_TO_EDGE);
  glTexParameteri(GL_TEXTURE_RECTANGLE_ARB, GL_TEXTURE_WRAP_T,
                  GL_CLAMP_TO_EDGE);
  glPixelStorei(GL_UNPACK_SWAP_BYTES, 0);
  glPixelStorei(GL_UNPACK_LSB_FIRST, 0);
  glPixelStorei(GL_UNPACK_ROW_LENGTH, 0);
  glPixelStorei(GL_UNPACK_IMAGE_HEIGHT, 0);
  glPixelStorei(GL_UNPACK_SKIP_PIXELS, 0);
  glPixelStorei(GL_UNPACK_SKIP_IMAGES, 0);
  glPixelStorei(GL_UNPACK_SKIP_ROWS, 0);
  glPixelStorei(GL_UNPACK_ALIGNMENT, 4);

  GLuint vertexProgram;
  GLuint fragmentProgram;

  glGenProgramsARB(1, &vertexProgram);
  glGenProgramsARB(1, &fragmentProgram);

  glBindProgramARB(GL_VERTEX_PROGRAM_ARB, vertexProgram);
  glProgramStringARB(GL_VERTEX_PROGRAM_ARB, GL_PROGRAM_FORMAT_ASCII_ARB,
                     (GLsizei)strlen(sepiaVertexProgram), sepiaVertexProgram);

  glBindProgramARB(GL_FRAGMENT_PROGRAM_ARB, fragmentProgram);
  glProgramStringARB(GL_FRAGMENT_PROGRAM_ARB, GL_PROGRAM_FORMAT_ASCII_ARB,
                     (GLsizei)strlen(sepiaFragmentProgram),
                     sepiaFragmentProgram);

  GLfloat l0[] = {1.0f, 0.99f, 0.92f, 1.0f};
  GLfloat l1[] = {0.5, 0, 0, 0};
  GLfloat l2[] = {1, 0, 0, 0};
  GLfloat l3[] = {0, -1, 0, (GLfloat)height_};

  glProgramLocalParameter4fvARB(GL_VERTEX_PROGRAM_ARB, 0, l0);
  glProgramLocalParameter4fvARB(GL_VERTEX_PROGRAM_ARB, 1, l1);
  glProgramLocalParameter4fvARB(GL_VERTEX_PROGRAM_ARB, 2, l2);
  glProgramLocalParameter4fvARB(GL_VERTEX_PROGRAM_ARB, 3, l3);

  glProgramLocalParameter4fvARB(GL_FRAGMENT_PROGRAM_ARB, 0, l0);
  glProgramLocalParameter4fvARB(GL_FRAGMENT_PROGRAM_ARB, 1, l1);
  glProgramLocalParameter4fvARB(GL_FRAGMENT_PROGRAM_ARB, 2, l2);
  glProgramLocalParameter4fvARB(GL_FRAGMENT_PROGRAM_ARB, 3, l3);

  GLuint fbo;

  glGenFramebuffersEXT(1, &fbo);

  glBindFramebufferEXT(GL_FRAMEBUFFER_EXT, fbo);

  glFramebufferTexture2DEXT(GL_FRAMEBUFFER_EXT, GL_COLOR_ATTACHMENT0_EXT,
                            GL_TEXTURE_RECTANGLE_ARB, texId, 0);
  glViewport(0, 0, width_, height_);
  glMatrixMode(GL_PROJECTION);
  glLoadIdentity();
  glOrtho(0, width_, 0, height_, -1, 1);
  glClearColor(0, 0, 0, 0);
  glClear(GL_COLOR_BUFFER_BIT);
  glDisable(GL_BLEND);

  glEnable(GL_VERTEX_PROGRAM_ARB);
  glEnable(GL_FRAGMENT_PROGRAM_ARB);

  // warm up
  for (unsigned int k = 0; k < (iterations_ / 10); k++) {
    glBegin(GL_QUADS);
    glTexCoord2f(0, 0);
    glVertex2f(0, (GLfloat)height_);
    glTexCoord2f((GLfloat)width_, 0);
    glVertex2f((GLfloat)width_, (GLfloat)height_);
    glTexCoord2f((GLfloat)width_, (GLfloat)height_);
    glVertex2f((GLfloat)width_, 0);
    glTexCoord2f(0, (GLfloat)height_);
    glVertex2f(0, 0);
    glEnd();
    glFlush();
    glFinish();
  }

  // actual test
  for (unsigned int k = 0; k < iterations_; k++) {
    if (k == 1) {
      timer_.Reset();
      timer_.Start();
    }

    glBegin(GL_QUADS);
    glTexCoord2f(0, 0);
    glVertex2f(0, (GLfloat)height_);
    glTexCoord2f((GLfloat)width_, 0);
    glVertex2f((GLfloat)width_, (GLfloat)height_);
    glTexCoord2f((GLfloat)width_, (GLfloat)height_);
    glVertex2f((GLfloat)width_, 0);
    glTexCoord2f(0, (GLfloat)height_);
    glVertex2f(0, 0);
    glEnd();
  }

  glFlush();
  glFinish();

  timer_.Stop();

  glDisable(GL_VERTEX_PROGRAM_ARB);
  glDisable(GL_FRAGMENT_PROGRAM_ARB);

  // now let's read back the pixels
  result_ = (cl_uchar *)malloc(width_ * height_ * 4);

  glReadPixels(0, 0, width_, height_, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8_REV,
               result_);

  // bind back default frame buffer
  glBindFramebufferEXT(GL_FRAMEBUFFER_EXT, 0);

  glDeleteFramebuffersEXT(1, &fbo);
  glDeleteTextures(1, &srcTexture);
  glDeleteProgramsARB(1, &vertexProgram);
  glDeleteProgramsARB(1, &fragmentProgram);
}

void OCLPerfSepia::runCL(void) {
  cl_mem dst, src;
  cl_sampler nearestZero;

  glEnable(GL_TEXTURE_RECTANGLE_EXT);
  glGenTextures(1, &texId);
  glBindTexture(GL_TEXTURE_RECTANGLE_EXT, texId);
  // XXX Alex: have GL alloc memory for us ...
  glTexImage2D(GL_TEXTURE_RECTANGLE_EXT, 0, GL_RGBA, width_, height_, 0,
               GL_RGBA /*BGRA*/, GL_UNSIGNED_INT_8_8_8_8_REV, NULL);

  dst = _wrapper->clCreateFromGLTexture2D(
      context_, CL_MEM_READ_WRITE, GL_TEXTURE_RECTANGLE_EXT, 0, texId, &error_);
  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateFromGLTexture2D error (%d)",
               error_);
  nearestZero = _wrapper->clCreateSampler(context_, CL_FALSE, CL_ADDRESS_CLAMP,
                                          CL_FILTER_NEAREST, &error_);
  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateSampler error (%d)", error_);
  src = _wrapper->clCreateImage2D(
      context_, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, &format_, width_,
      height_, bpr_, data_, &error_);
  CHECK_RESULT((error_ != CL_SUCCESS), "clCreateImage2D error (%d)", error_);

  int numArgs = 0;
  int dim[2] = {(int)width_, (int)height_};
  int flipped[] = {1};
  int dims[] = {(int)width_, (int)height_, 0, 0};
  float st_origin[] = {0, 0};
  float st_delta[] = {1, 0, 0, 1};

  _wrapper->clSetKernelArg(kernel_, numArgs++, sizeof(cl_mem),
                           &dst);  // arg is a image2DGL named "dst"
  _wrapper->clSetKernelArg(kernel_, numArgs++, sizeof(int),
                           &flipped);  // arg is a int1 named "flipped"
  _wrapper->clSetKernelArg(kernel_, numArgs++, 4 * sizeof(int),
                           &dims);  // arg is a int4 named "dim"
  _wrapper->clSetKernelArg(kernel_, numArgs++, 2 * sizeof(float),
                           &st_origin);  // arg is a float2 named "st_origin"
  _wrapper->clSetKernelArg(kernel_, numArgs++, 4 * sizeof(float),
                           &st_delta);  // arg is a float4 named "st_delta"

  float l0[] = {1.0f, 0.99f, 0.92f, 1.0f};
  float l1[] = {0.5f, 0.0f, 0.0f, 0.0f};
  float l2[] = {1.0f, 0.0f, 0.0f, 0.0f};
  float l3[] = {0.0f, -1.0f, 0.0f, (float)height_};

  _wrapper->clSetKernelArg(kernel_, numArgs++, 4 * sizeof(float),
                           &l0);  // arg is a float4 named "l0"
  _wrapper->clSetKernelArg(kernel_, numArgs++, 4 * sizeof(float),
                           &l1);  // arg is a float4 named "l1"
  _wrapper->clSetKernelArg(kernel_, numArgs++, 4 * sizeof(float),
                           &l2);  // arg is a float4 named "l2"
  _wrapper->clSetKernelArg(kernel_, numArgs++, 4 * sizeof(float),
                           &l3);  // arg is a float4 named "l3"
  _wrapper->clSetKernelArg(kernel_, numArgs++, sizeof(cl_mem),
                           &src);  // arg is a image2D named "t0"
  _wrapper->clSetKernelArg(
      kernel_, numArgs++, sizeof(cl_sampler),
      &nearestZero);  // arg is a sampler named "t_sampler0"

  size_t execution_threads[2];
  size_t execution_local[2];
  cl_uint work_dim = 2;
  error_ = _wrapper->clGetKernelWorkGroupInfo(
      kernel_, devices_[_deviceId], CL_KERNEL_WORK_GROUP_SIZE,
      sizeof(execution_local[0]), &execution_local[0], 0);
  CHECK_RESULT((error_ != CL_SUCCESS), "clGetKernelWorkGroupInfo error (%d)",
               error_);
  execution_local[1] = 1;
  work_dim = 2;
  GetKernelExecDimsForImage((unsigned int)execution_local[0], dim[0], dim[1],
                            execution_threads, execution_local);
  result_ = (cl_uchar *)malloc(height_ * bpr_);

  const size_t origin[] = {0, 0, 0};
  const size_t region[] = {width_, height_, 1};

  // warm up
  for (unsigned int k = 0; k < (iterations_ / 10); k++) {
    error_ = _wrapper->clEnqueueNDRangeKernel(cmdQueues_[_deviceId], kernel_,
                                              work_dim, NULL, execution_threads,
                                              execution_local, 0, NULL, NULL);
    CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel error (%d)",
                 error_);
    error_ = _wrapper->clFinish(cmdQueues_[_deviceId]);
    CHECK_RESULT((error_ != CL_SUCCESS), "clFinish error (%d)", error_);
  }

  // actual test
  for (unsigned int k = 0; k < iterations_; k++) {
    if (k == 1) {
      timer_.Reset();
      timer_.Start();
    }
    error_ = _wrapper->clEnqueueNDRangeKernel(cmdQueues_[_deviceId], kernel_,
                                              work_dim, NULL, execution_threads,
                                              execution_local, 0, NULL, NULL);
    CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueNDRangeKernel error (%d)",
                 error_);
  }
  error_ = _wrapper->clFinish(cmdQueues_[_deviceId]);
  CHECK_RESULT((error_ != CL_SUCCESS), "clFinish error (%d)", error_);

  timer_.Stop();

  error_ =
      _wrapper->clEnqueueReadImage(cmdQueues_[_deviceId], dst, true, origin,
                                   region, bpr_, 0, result_, 0, NULL, NULL);
  CHECK_RESULT((error_ != CL_SUCCESS), "clEnqueueReadImage error (%d)", error_);
  _wrapper->clFinish(cmdQueues_[_deviceId]);

  _wrapper->clReleaseMemObject(src), src = NULL;
  _wrapper->clReleaseSampler(nearestZero);
  _wrapper->clReleaseMemObject(dst), dst = NULL;
}

void OCLPerfSepia::GetKernelExecDimsForImage(unsigned int work_group_size,
                                             unsigned int w, unsigned int h,
                                             size_t *global, size_t *local) {
  unsigned int a, b;
  static const unsigned int tile_size = 16;

  // local[0] and local[1] must be at least 1
  local[0] = tile_size < work_group_size ? tile_size : work_group_size;
  local[1] = work_group_size / tile_size > tile_size
                 ? tile_size
                 : MAX(work_group_size / tile_size, 1);

  a = w;
  b = (unsigned int)local[0];

  global[0] = ((a % b) != 0) ? (a / b + 1) : (a / b);
  global[0] *= local[0];

  a = h;
  b = (unsigned int)local[1];

  global[1] = ((a % b) != 0) ? (a / b + 1) : (a / b);
  global[1] *= local[1];
}

void OCLPerfSepia::run(void) {
  if (_errorFlag || silentFailure_) {
    return;
  }
  populateData();
  if (_openTest == 0) {
    runCL();
  } else {
    runGL();
  }
  if (bVerify_) {
    verifyResult();
  }
  char buf[100];
  SNPRINTF(buf, sizeof(buf), "%s iterations# %d",
           (_openTest == 0) ? "CL" : "GL", iterations_);
  testDescString = buf;
  _perfInfo = (float)timer_.GetElapsedTime();
}

void OCLPerfSepia::verifyResult(void) {
  int r = 0, g = 0, b = 0, a = 0, d = 0;
  for (unsigned int k = 0; k < height_ * bpr_; k += 4) {
    a = a + result_[k + 0];
    r = r + result_[k + 1];
    g = g + result_[k + 2];
    b = b + result_[k + 3];
  }
  d = abs(r - 152797810) + abs(g - 125868080) + abs(b - 76147833) +
      abs(a - 267386880);
  CHECK_RESULT(d > 20000, "wrong result");
}
unsigned int OCLPerfSepia::close(void) {
  if (silentFailure_) {
    return 0;
  }

  if (data_) {
    free(data_);
  }

  if (result_) {
    free(result_);
  }

  if (texId) {
    glDeleteTextures(1, &texId);
  }

  return OCLGLCommon::close();
}
