/* Copyright (c) 2010 - 2021 Advanced Micro Devices, Inc. Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ #include "OCLPerfKernelThroughput.h" #include #include #include #include #include #include "CL/cl.h" #include "Timer.h" #define DO_GPU_KERNELS 1 #if 0 #define ENTER(X) printf("Entering %s\n", X); #define EXIT(X) printf("Exiting %s\n", X); #define PKT(X) X #else #define ENTER(X) #define EXIT(X) #define PKT(X) #endif // work with multiples of 128 #define ROUND_MULT(VAL, MULT) ((VAL / MULT) * MULT) /* int roundUp( int numToRound, int multiple) { int r = numToRound % multiple; if (r == 0) { return numToRound; } else { return numToRound + multiple - remainder; } } */ // quiety warnings #ifdef WIN_OS #define SNPRINTF sprintf_s #else #define SNPRINTF snprintf #endif #define WORK_GROUP_SIZE 256 /******************************************************************************* * Enumerated Types for Tests ******************************************************************************/ // memory operations const LARGE_INT numKernelTypes = 2; static const char *kernelType[numKernelTypes] = {"MatMul", "Madds"}; // source/read memory locations const LARGE_INT numMemPaths = 2; static const char *memPath[numMemPaths] = {"Host", "Device"}; // buffer size const LARGE_INT numNumElements = 12; // 15; static const LARGE_INT numElements[numNumElements] = { 4, 16, 64, 256, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304, 16777216 //, // 67108864, // 268435456 }; // flops/byte const LARGE_INT numWorkSizes = 5; static const LARGE_INT workSize[numWorkSizes] = {1, 4, 16, 64, 256}; const float initFloat = 0.001f; const float zeroFloat = 0.0f; #define WORK_GROUP_SIZE 256 /******************************************************************************* * Write the Matrix Multiply Shader Kernel ******************************************************************************/ void OCLPerfKernelThroughput::genShaderMatrixMultiply() { ENTER("genShaderMatrixMultiply"); std::stringstream ss; ss.clear(); #if 0 printf("%ix%i * %ix%i = %ix%i:\n", matrixDim1_, matrixDim2_, matrixDim2_, matrixDim1_, matrixDim1_, matrixDim1_ ); #endif ss << "#define BLOCK_SIZE 16\n" "#define HA " << matrixDim1_ << "\n" "#define WA " << matrixDim2_ << "\n" "#define HB WA\n" "#define WB HA\n" "#define HC HA\n" "#define WC WB\n" "__kernel void\n" "__attribute__((reqd_work_group_size(16,16,1)))\n" "kernel1(\n" " __global float * restrict C,\n" " __global float * restrict A,\n" " __global float * restrict B )\n" "{\n" " int bx = get_group_id(0);\n" " int by = get_group_id(1);\n" " int tx = get_local_id(0);\n" " int ty = get_local_id(1);\n" " int aBegin = WA * BLOCK_SIZE * by;\n" " int aEnd = aBegin + WA - 1;\n" " int aStep = BLOCK_SIZE;\n" " int bBegin = BLOCK_SIZE * bx;\n" " int bStep = BLOCK_SIZE * WB;\n" " __private float c = 0.f;\n" " __local float localA[BLOCK_SIZE][BLOCK_SIZE];\n" " __local float localB[BLOCK_SIZE][BLOCK_SIZE];\n" " for (\n" " int a = aBegin, b = bBegin;\n" " a <= aEnd;\n" " a += aStep, b += bStep)\n" " {\n" " localA[ty][tx] = (get_global_id(0) < WA && get_global_id(1) < " "HA) ? A[a + WA * ty + tx] : 0;\n" " localB[ty][tx] = (get_global_id(0) < WB && get_global_id(1) < " "HB) ? B[b + WB * ty + tx] : 0;\n" " barrier(CLK_LOCAL_MEM_FENCE);\n" " for (int k = 0; k < BLOCK_SIZE; ++k)\n" " c += localA[ty][k] * localB[k][tx];\n" " barrier(CLK_LOCAL_MEM_FENCE);\n" " }\n" " int cIdx = WB * BLOCK_SIZE * by + BLOCK_SIZE * bx + WB * ty + tx;\n" " if (get_global_id(0) < WC && get_global_id(1) < WC)\n" " {\n" " C[cIdx] = c;\n" " }\n" "}\n"; shader_ = ss.str(); gold_ = 0.f; for (int i = 0; i < matrixDim2_; i++) gold_ += initFloat * initFloat; // gold_ = initFloat * initFloat * matrixDim2_; // printf("shader:\n%s\n", shader_.c_str()); // printf("gold_: %f\n", gold_); EXIT("genShaderMatrixMultiply"); } /******************************************************************************* * Write the Madds Shader Kernel ******************************************************************************/ void OCLPerfKernelThroughput::genShaderMadds() { ENTER("genShaderMadds"); int flopLoopIter = 2 * (flopsPerByte_ * 4 * 4) / 16; // bytes, flops std::stringstream ss; ss.clear(); float a, b; ss << // begin kernel "__kernel void\n" "__attribute__((reqd_work_group_size(" << 256 << ",1,1)))\n" "kernel1(\n" " __global float4 * restrict input,\n" " __global float4 * restrict output )\n" "{\n"; // begin loop ss << " for ( uint idx = get_global_id(0);\n" " idx < " << numElements[numElementsIdx_] << ";\n" " idx += get_global_size(0) )\n" " {\n"; // do load ss << " float4 prefetch = input[ idx ];\n" " float a0 = prefetch.x;\n" " float a1 = prefetch.y;\n" " float a2 = prefetch.z;\n" " float a3 = prefetch.w;\n" " float b0 = a0;\n" " float b1 = a1;\n" " float b2 = a2;\n" " float b3 = a3;\n"; a = initFloat; b = a; // do math for (int i = 0; i < flopLoopIter; i++) { ss << " a0 += b3*b1;\n" " a1 += b0*b2;\n" " a2 += b1*b3;\n" " a3 += b2*b0;\n" " b0 += a3*a1;\n" " b1 += a0*a2;\n" " b2 += a1*a3;\n" " b3 += a2*a0;\n"; // printf("a += b*b; %f += %f*%f\n", a, b, b); a += b * b; // printf("b += a*a; %f += %f*%f\n", b, a, a); b += a * a; } // do write or accumulate ss << " __private float4 tmp;\n" " tmp.x = b0;\n" " tmp.y = b1;\n" " tmp.z = b2;\n" " tmp.w = b3;\n" " output[ idx ] = tmp;\n"; gold_ = b; // printf("GPU gold_ Tmp: %f\n", gold_); // end loop ss << " } // end loop\n"; // end kernel ss << " } // end kernel\n\n"; shader_ = ss.str(); // printf("shader:\n%s\n", shader_.c_str()); // printf("gold_: %f\n", gold_); EXIT("genShaderMadds"); } static void CL_CALLBACK notify_callback(const char *errinfo, const void *private_info, size_t cb, void *user_data) {} /******************************************************************************* * Constructor ******************************************************************************/ OCLPerfKernelThroughput::OCLPerfKernelThroughput() { ENTER("constructor"); _numSubTests = numKernelTypes * numMemPaths * numNumElements * numWorkSizes; cl_uint numPlatforms; cl_platform_id platform = NULL; cl_uint num_devices = 0; cl_device_id *devices = NULL; cl_device_id device = NULL; context_ = 0; error_ = _wrapper->clGetPlatformIDs(0, NULL, &numPlatforms); CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed"); if (0 < numPlatforms) { cl_platform_id *platforms = new cl_platform_id[numPlatforms]; error_ = _wrapper->clGetPlatformIDs(numPlatforms, platforms, NULL); CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed"); // Get last for default platform = platforms[numPlatforms - 1]; for (unsigned i = 0; i < numPlatforms; ++i) { char pbuf[100]; error_ = _wrapper->clGetPlatformInfo(platforms[i], CL_PLATFORM_VENDOR, sizeof(pbuf), pbuf, NULL); num_devices = 0; /* Get the number of requested devices */ error_ = _wrapper->clGetDeviceIDs(platforms[i], type_, 0, NULL, &num_devices); // Runtime returns an error when no GPU devices are present // instead of just returning 0 devices // CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceIDs failed"); // Choose platform with GPU devices if (num_devices > 0) { // printf("NumDevices: %i\n", num_devices); platform = platforms[i]; break; } } delete platforms; } /* * If we could find our platform, use it, else die. */ CHECK_RESULT(platform == 0, "Couldn't find AMD platform, cannot proceed"); devices = (cl_device_id *)malloc(num_devices * sizeof(cl_device_id)); CHECK_RESULT(devices == 0, "no devices"); /* Get the requested device */ error_ = _wrapper->clGetDeviceIDs(platform, type_, num_devices, devices, NULL); CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceIDs failed"); CHECK_RESULT(_deviceId >= num_devices, "Requested deviceID not available"); device = devices[_deviceId]; // get gpu speed error_ = _wrapper->clGetDeviceInfo(device, CL_DEVICE_MAX_CLOCK_FREQUENCY, sizeof(maxClockFrequency_), &maxClockFrequency_, NULL); CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed"); error_ = _wrapper->clGetDeviceInfo(device, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(maxComputeUnits_), &maxComputeUnits_, NULL); CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed"); if (maxComputeUnits_ > 8) { // printf("%i CUs reported; assuming 8 instead.", maxComputeUnits_); maxComputeUnits_ = 8; } // printf("Compute Units: %i\n", maxComputeUnits_); // printf("Subtests: %i\n", _numSubTests); // create context context_ = _wrapper->clCreateContext(NULL, 1, &device, notify_callback, NULL, &error_); CHECK_RESULT(context_ == 0, "clCreateContext failed"); char charbuf[1024]; size_t retsize; error_ = _wrapper->clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, 1024, charbuf, &retsize); CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceInfo failed"); if (context_) { error_ = _wrapper->clReleaseContext(context_); CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseContext failed"); } cl_uint tmp; error_ = _wrapper->clGetDeviceInfo(device, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(tmp), &tmp, NULL); CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clGetDeviceInfo failed"); // printf("NumComputeUnits: %u\n", tmp); maxComputeUnits_ = static_cast(tmp); // printf("NumComputeUnits: %lld\n", maxComputeUnits_); EXIT("constructor"); } OCLPerfKernelThroughput::~OCLPerfKernelThroughput() {} /******************************************************************************* * Open - initializes test, compile GPU kernel ******************************************************************************/ void OCLPerfKernelThroughput::open(unsigned int test, char *units, double &conversion, unsigned int deviceId) { ENTER("open"); /*********************************************************** * select subtest **********************************************************/ int testIdx = test + numKernelTypes * numMemPaths * numNumElements * numWorkSizes; memPathIdx_ = testIdx % numMemPaths; testIdx /= numMemPaths; numElementsIdx_ = testIdx % numNumElements; testIdx /= numNumElements; workSizeIdx_ = testIdx % numWorkSizes; testIdx /= numWorkSizes; kernelTypeIdx_ = testIdx % numKernelTypes; testIdx /= numKernelTypes; // float md1; // kernel values switch (kernelTypeIdx_) { case 0: // Matrix Multiply // md1 = sqrt(1.f*numElements[numElementsIdx_]); // printf("MD1: sqrt(%f) = %f\n", 1.f*numElements[numElementsIdx_],md1); matrixDim1_ = static_cast(sqrt(1.f * numElements[numElementsIdx_])); matrixDim2_ = matrixDim1_ * (int)workSize[workSizeIdx_]; genShaderMatrixMultiply(); work_dim_ = 2; global_work_size_ = new size_t[work_dim_]; global_work_size_[0] = ((matrixDim1_ - 1) / 16 + 1) * 16; // matrixDim1_ < 16 ? 16 : matrixDim1_; global_work_size_[1] = global_work_size_[0]; local_work_size_ = new size_t[work_dim_]; local_work_size_[0] = 16; local_work_size_[1] = local_work_size_[0]; /* printf("Global: %ix%i; Local: %ix%i; Matrix: %ix%i\n", global_work_size_[0], global_work_size_[1], local_work_size_[0], local_work_size_[1], matrixDim1_, matrixDim2_ ); */ input1BufferSize_ = static_cast(matrixDim1_ * matrixDim2_ * sizeof(float)); input2BufferSize_ = static_cast(matrixDim2_ * matrixDim1_ * sizeof(float)); output1BufferSize_ = static_cast(matrixDim1_ * matrixDim1_ * sizeof(float)); _reqDataSize = (1.0 * matrixDim1_ * matrixDim2_ * sizeof(float)) + (1.0 * matrixDim2_ * matrixDim1_ * sizeof(float)) + (1.0 * matrixDim1_ * matrixDim1_ * sizeof(float)); break; case 1: // Flops/Byte flopsPerByte_ = (int)workSize[workSizeIdx_]; // for kernelType == 0 genShaderMadds(); numWorkGroupsPerComputeUnit_ = 32; // TODO numThreads_ = numWorkGroupsPerComputeUnit_ * maxComputeUnits_ * WORK_GROUP_SIZE; work_dim_ = 1; global_work_size_ = new size_t[work_dim_]; local_work_size_ = new size_t[work_dim_]; global_work_size_[0] = numThreads_; local_work_size_[0] = WORK_GROUP_SIZE; input1BufferSize_ = static_cast(numElements[numElementsIdx_] * sizeof(float4)); input2BufferSize_ = 0; output1BufferSize_ = static_cast(numElements[numElementsIdx_] * sizeof(float4)); _reqDataSize = 2.0 * numElements[numElementsIdx_] * sizeof(float4); break; } PKT(printf("Test Parameters:\n" "\tkernelTypeIdx: %i\n" "\tmemPathIdx: %i\n" "\tnumElementsIdx: %i\n" "\tworkSizeIdx: %i\n" "\n\n", kernelTypeIdx_, memPathIdx_, numElementsIdx_, workSizeIdx_);) /*********************************************************** * get context and queue **********************************************************/ cl_uint numPlatforms; cl_platform_id platform = NULL; cl_uint num_devices = 0; cl_device_id *devices = NULL; cl_device_id device = NULL; _crcword = 0; conversion = 1.0; _deviceId = deviceId; context_ = 0; cmd_queue_ = 0; program_ = 0; kernel_ = 0; input1Buffer_ = 0; output1Buffer_ = 0; _errorFlag = false; // Reset error code so a single error // doesn't prevent other subtests from running _errorMsg = ""; error_ = _wrapper->clGetPlatformIDs(0, NULL, &numPlatforms); CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed"); if (0 < numPlatforms) { cl_platform_id *platforms = new cl_platform_id[numPlatforms]; error_ = _wrapper->clGetPlatformIDs(numPlatforms, platforms, NULL); CHECK_RESULT(error_ != CL_SUCCESS, "clGetPlatformIDs failed"); platform = platforms[_platformIndex]; char pbuf[100]; error_ = _wrapper->clGetPlatformInfo(platforms[_platformIndex], CL_PLATFORM_VENDOR, sizeof(pbuf), pbuf, NULL); num_devices = 0; /* Get the number of requested devices */ error_ = _wrapper->clGetDeviceIDs(platforms[_platformIndex], type_, 0, NULL, &num_devices); // Runtime returns an error when no GPU devices are present // instead of just returning 0 devices // CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceIDs failed"); delete platforms; } /* * If we could find our platform, use it, else die. */ CHECK_RESULT(platform == 0, "Couldn't find AMD platform, cannot proceed"); devices = (cl_device_id *)malloc(num_devices * sizeof(cl_device_id)); CHECK_RESULT(devices == 0, "no devices"); /* * Get the requested device */ error_ = _wrapper->clGetDeviceIDs(platform, type_, num_devices, devices, NULL); CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceIDs failed"); device = devices[0]; context_ = _wrapper->clCreateContext(NULL, 1, &device, notify_callback, NULL, &error_); CHECK_RESULT(context_ == 0, "clCreateContext failed"); cmd_queue_ = _wrapper->clCreateCommandQueue(context_, device, CL_QUEUE_PROFILING_ENABLE, NULL); CHECK_RESULT(cmd_queue_ == 0, "clCreateCommandQueue failed"); // Global memory size cl_ulong _maxMemoryAllocationSize; error_ = _wrapper->clGetDeviceInfo(device, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(cl_ulong), &_maxMemoryAllocationSize, NULL); CHECK_RESULT(error_ != CL_SUCCESS, "clGetDeviceIDs(CL_DEVICE_GLOBAL_MEM_SIZE) failed"); #if 0 printf("Buffer Sizes: %i %i %i = %f\n", input1BufferSize_, input2BufferSize_, output1BufferSize_, _reqDataSize); #endif _dataSizeTooBig = (_reqDataSize > _maxMemoryAllocationSize); if (_dataSizeTooBig) { // printf("DATA TOO LARGE FOR DEVICE !!!"); return; } // create kernel char *tmp = (char *)shader_.c_str(); program_ = _wrapper->clCreateProgramWithSource( context_, 1, (const char **)&tmp, NULL, &error_); CHECK_RESULT(program_ == 0, "clCreateProgramWithSource failed"); std::string args; args.clear(); error_ = _wrapper->clBuildProgram(program_, 1, &device, args.c_str(), NULL, NULL); if (error_ != CL_SUCCESS) { cl_int intError; char log[16384]; intError = _wrapper->clGetProgramBuildInfo(program_, device, CL_PROGRAM_BUILD_LOG, 16384 * sizeof(char), log, NULL); printf("Build error -> %s\n", log); CHECK_RESULT(0, "clBuildProgram failed"); } kernel_ = _wrapper->clCreateKernel(program_, "kernel1", &error_); CHECK_RESULT(kernel_ == 0, "clCreateKernel failed"); /*********************************************************** * Allocate GPU Memory **********************************************************/ cl_mem_flags inputBufferFlags = 0; cl_mem_flags outputBufferFlags = 0; // choose gpu source buffer type switch (memPathIdx_) { case 0: // host memory // printf("Allocating Host Memories\n"); // allocate "device" memory inputBufferFlags = CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR; outputBufferFlags = CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR; input1Buffer_ = _wrapper->clCreateBuffer( context_, inputBufferFlags, input1BufferSize_, NULL, &error_); CHECK_RESULT(input1Buffer_ == 0, "clCreateBuffer Input failed"); if (input1Buffer_ == 0) printf("Error: %i\n", error_); if (input2BufferSize_) { input2Buffer_ = _wrapper->clCreateBuffer( context_, inputBufferFlags, input2BufferSize_, NULL, &error_); CHECK_RESULT(input2Buffer_ == 0, "clCreateBuffer Input failed"); } output1Buffer_ = _wrapper->clCreateBuffer( context_, outputBufferFlags, output1BufferSize_, NULL, &error_); CHECK_RESULT(output1Buffer_ == 0, "clCreateBuffer Input failed"); if (output1Buffer_ == 0) printf("Error: %i\n", error_); // map host memory input1Ptr_ = (float *)_wrapper->clEnqueueMapBuffer( cmd_queue_, input1Buffer_, true, CL_MAP_WRITE, 0, input1BufferSize_, 0, NULL, NULL, &error_); if (input2BufferSize_) { input2Ptr_ = (float *)_wrapper->clEnqueueMapBuffer( cmd_queue_, input2Buffer_, true, CL_MAP_WRITE, 0, input2BufferSize_, 0, NULL, NULL, &error_); } output1Ptr_ = (float *)_wrapper->clEnqueueMapBuffer( cmd_queue_, output1Buffer_, true, CL_MAP_READ, 0, output1BufferSize_, 0, NULL, NULL, &error_); _wrapper->clFinish(cmd_queue_); break; case 1: // device memory // printf("Allocating Device Memories\n"); // allocate device memory inputBufferFlags = CL_MEM_READ_WRITE; outputBufferFlags = CL_MEM_READ_WRITE; input1Buffer_ = _wrapper->clCreateBuffer( context_, inputBufferFlags, input1BufferSize_, NULL, &error_); CHECK_RESULT(input1Buffer_ == 0, "clCreateBuffer Input failed"); if (input2BufferSize_) { input2Buffer_ = _wrapper->clCreateBuffer( context_, inputBufferFlags, input2BufferSize_, NULL, &error_); CHECK_RESULT(input2Buffer_ == 0, "clCreateBuffer Input failed"); } output1Buffer_ = _wrapper->clCreateBuffer( context_, outputBufferFlags, output1BufferSize_, NULL, &error_); CHECK_RESULT(output1Buffer_ == 0, "clCreateBuffer Input failed"); // printf("\tDone Allocating Device Memory\n"); // allocate host memory input1Ptr_ = new float[input1BufferSize_ / sizeof(float)]; if (input2BufferSize_) { input2Ptr_ = new float[input2BufferSize_ / sizeof(float)]; } output1Ptr_ = new float[output1BufferSize_ / sizeof(float)]; // printf("\tDone Allocating Host Memory\n"); break; default: CHECK_RESULT(1, "Invalid Memory Path Idx"); // invalid } for (unsigned int i = 0; i < input1BufferSize_ / sizeof(float); i++) { input1Ptr_[i] = initFloat; } for (unsigned int i = 0; i < input2BufferSize_ / sizeof(float); i++) { input2Ptr_[i] = initFloat; } for (unsigned int i = 0; i < output1BufferSize_ / sizeof(float); i++) { output1Ptr_[i] = zeroFloat; } #if 0 printf("Allocating GPU: %.0fMB, %.0fMB\n", static_cast(1.f*input1BufferSize_/1024.f/1024.f), static_cast(1.f*output1BufferSize_/1024.f/1024.f)); input1Buffer_ = _wrapper->clCreateBuffer( context_, inputBufferFlags, input1BufferSize_, NULL, &error_); CHECK_RESULT(input1Buffer_ == 0, "clCreateBuffer Input failed"); output1Buffer_ = _wrapper->clCreateBuffer( context_, outputBufferFlags, output1BufferSize_, NULL, &error_); CHECK_RESULT(output1Buffer_ == 0, "clCreateBuffer Output failed"); error_ = /*_wrapper->*/clEnqueueFillBuffer( cmd_queue_, input1Buffer_, &initFloat, sizeof(initFloat), 0, input1BufferSize_, 0, NULL, NULL); CHECK_RESULT(error_ != CL_SUCCESS, "clEnqueueFillBuffer failed"); error_ = /*_wrapper->*/clEnqueueFillBuffer( cmd_queue_, output1Buffer_, &zeroFloat, sizeof(zeroFloat), 0, output1BufferSize_, 0, NULL, NULL); CHECK_RESULT(error_ != CL_SUCCESS, "clEnqueueFillBuffer failed"); /*********************************************************** * Set Kernel Args **********************************************************/ error_ = _wrapper->clSetKernelArg( kernel_, 0, sizeof(input1Buffer_), (void *) &input1Buffer_); CHECK_RESULT(error_ != CL_SUCCESS, "clSetKernelArg failed"); error_ = _wrapper->clSetKernelArg( kernel_, 1, sizeof(output1Buffer_), (void *) &output1Buffer_); CHECK_RESULT(error_ != CL_SUCCESS, "clSetKernelArg failed"); #endif EXIT("open"); } /******************************************************************************* * Run - execute full test once and return performance ******************************************************************************/ void OCLPerfKernelThroughput::run(void) { ENTER("run"); CPerfCounter timer; if (!_dataSizeTooBig) { // set kernel args #if 1 switch (kernelTypeIdx_) { case 0: // Matrix Multiply error_ = _wrapper->clSetKernelArg(kernel_, 0, sizeof(output1Buffer_), (void *)&output1Buffer_); CHECK_RESULT(error_ != CL_SUCCESS, "clSetKernelArg failed"); error_ = _wrapper->clSetKernelArg(kernel_, 1, sizeof(input1Buffer_), (void *)&input1Buffer_); CHECK_RESULT(error_ != CL_SUCCESS, "clSetKernelArg failed"); error_ = _wrapper->clSetKernelArg(kernel_, 2, sizeof(input2Buffer_), (void *)&input2Buffer_); CHECK_RESULT(error_ != CL_SUCCESS, "clSetKernelArg failed"); break; case 1: // Flops/Byte error_ = _wrapper->clSetKernelArg(kernel_, 0, sizeof(input1Buffer_), (void *)&input1Buffer_); CHECK_RESULT(error_ != CL_SUCCESS, "clSetKernelArg failed"); error_ = _wrapper->clSetKernelArg(kernel_, 1, sizeof(output1Buffer_), (void *)&output1Buffer_); CHECK_RESULT(error_ != CL_SUCCESS, "clSetKernelArg failed"); break; } #endif launchKernel(); timer.Reset(); timer.Start(); for (int i = 0; i < MAX_LOOP_ITER; i++) { launchKernel(); } timer.Stop(); } // data not too large double totalSec = _dataSizeTooBig ? 1 : timer.GetElapsedTime(); // printf("Total Time: %f seconds\n", totalSec); // printf("Average Kernel Time: %f seconds\n", totalSec / MAX_LOOP_ITER); // analyze performance avgKernelTime_ = (float)(totalSec / MAX_LOOP_ITER * 1000000); // microseconds double flopCount; switch (kernelTypeIdx_) { case 0: // Matrix Multiply flopCount = (2.0 * matrixDim1_ * matrixDim1_ * matrixDim2_); // printf("FlopCount = 2*%i*%i*%i=%f\n", // matrixDim1_,matrixDim1_,matrixDim2_,flopCount); bandwidth_ = (float)(1.f * _reqDataSize / 1024.f / 1024.f / 1024.f) * 1000000.f / avgKernelTime_; // GB/s gflops_ = (float)(1000000.f * flopCount / avgKernelTime_ / 1000000000.0); break; case 1: // Madds flopCount = _reqDataSize * flopsPerByte_; bandwidth_ = (float)(1.f * _reqDataSize / 1024.f / 1024.f / 1024.f) * 1000000.f / avgKernelTime_; // GB/s gflops_ = bandwidth_ * flopsPerByte_; break; } if (_dataSizeTooBig) { printf("REQUESTED DATA SIZE EXCEEDS GLOBAL MEMORY !!!\n"); bandwidth_ = 0; gflops_ = 0; avgKernelTime_ = 0; } // here print out details char buf[512]; int bytesWritten; bytesWritten = SNPRINTF( buf, sizeof(buf), "Kernel:%7s; " "Work:%4i; " "Buff:%11.0f; " "Path:%7s; " "%10.5e GB/s; " "%10.5e GFlop/s; ", kernelType[kernelTypeIdx_], static_cast(workSize[workSizeIdx_]), _reqDataSize, memPath[memPathIdx_], bandwidth_, gflops_); testDescString = buf; _perfInfo = avgKernelTime_; if (!_dataSizeTooBig) checkData(); EXIT("run"); } void OCLPerfKernelThroughput::launchKernel(void) { ENTER("launchKernel") /*********************************************************** * Copy Data To **********************************************************/ // printf("Copying Data To Device\n"); switch (memPathIdx_) { case 0: // zero copy // do nothing // void *inputPtr = _wrapper->clEnqueueMapBuffer( // cmd_queue_, input1Buffer_, true, CL_MAP_READ, // 0, input1BufferSize_, 0, NULL, NULL, &error_); // void *outputPtr = _wrapper->clEnqueueMapBuffer( // cmd_queue_, output1Buffer_, true, CL_MAP_READ, // 0, output1BufferSize_, 0, NULL, NULL, &error_); //_wrapper->clFinish(cmd_queue_); break; case 1: // explicit copy to device memory // printf("Queue: %p\n", &cmd_queue_); // printf("devBuffer: %i\n", input1Buffer_); // printf("hstBuffer: %p\n", input1Ptr_); // printf("bufSize: %i\n", input1BufferSize_); error_ = _wrapper->clEnqueueWriteBuffer( cmd_queue_, input1Buffer_, true, 0, input1BufferSize_, (const void *)input1Ptr_, 0, NULL, NULL); if (input2BufferSize_) { error_ = _wrapper->clEnqueueWriteBuffer( cmd_queue_, input2Buffer_, true, 0, input2BufferSize_, (const void *)input2Ptr_, 0, NULL, NULL); } // printf("Error: %i\n", error_); std::fflush(stdout); _wrapper->clFinish(cmd_queue_); CHECK_RESULT(error_ != CL_SUCCESS, "clWriteBuffer failed"); //_error = _wrapper->clEnqueueWriteBuffer( // cmd_queue_, output1Buffer_, true, 0, output1BufferSize_, // (const void *)output1Ptr_, 0, NULL, NULL ); // CHECK_RESULT(error_ != CL_SUCCESS, "clWriteBuffer failed"); break; } /*********************************************************** * Set Kernel Args **********************************************************/ #if 0 error_ = _wrapper->clSetKernelArg( kernel_, 0, sizeof(input1Buffer_), (void *) &input1Buffer_); CHECK_RESULT(error_ != CL_SUCCESS, "clSetKernelArg failed"); error_ = _wrapper->clSetKernelArg( kernel_, 1, sizeof(output1Buffer_), (void *) &output1Buffer_); CHECK_RESULT(error_ != CL_SUCCESS, "clSetKernelArg failed"); #endif // printf("Launching Kernel: %ix%i threads\n", global_work_size_[0], // local_work_size_[0]); /*********************************************************** * Launch Kernel **********************************************************/ error_ = _wrapper->clEnqueueNDRangeKernel( cmd_queue_, kernel_, work_dim_, NULL, (const size_t *)global_work_size_, (const size_t *)local_work_size_, 0, NULL, NULL); // printf("Error: %i\n", error_); CHECK_RESULT(error_, "clEnqueueNDRangeKernel failed"); _wrapper->clFinish(cmd_queue_); /*********************************************************** * Copy Data From **********************************************************/ // printf("Copying Data From Device\n"); switch (memPathIdx_) { case 0: // zero copy // do nothing // void *inputPtr = _wrapper->clEnqueueMapBuffer( // cmd_queue_, input1Buffer_, true, CL_MAP_READ, // 0, input1BufferSize_, 0, NULL, NULL, &error_); // void *outputPtr = _wrapper->clEnqueueMapBuffer( // cmd_queue_, output1Buffer_, true, CL_MAP_READ, // 0, output1BufferSize_, 0, NULL, NULL, &error_); //_wrapper->clFinish(cmd_queue_); break; case 1: // explicit copy to device memory //_error = _wrapper->clEnqueueReadBuffer( // cmd_queue_, input1Buffer_, true, 0, input1BufferSize_, // (void *)input1Ptr_, 0, NULL, NULL ); // CHECK_RESULT(error_ != CL_SUCCESS, "clWriteBuffer failed"); // printf("VAL0 %p error_ = _wrapper->clEnqueueReadBuffer( cmd_queue_, output1Buffer_, true, 0, output1BufferSize_, (void *)output1Ptr_, 0, NULL, NULL); // printf("Error: %i\n", error_); CHECK_RESULT(error_ != CL_SUCCESS, "clWriteBuffer failed"); break; } EXIT("launchKernel") } /******************************************************************************* * Check Data ******************************************************************************/ void OCLPerfKernelThroughput::checkData() { _wrapper->clFinish(cmd_queue_); float errorThreshhold = 0.00001f; float eqMax = gold_ + errorThreshhold * gold_; float eqMin = gold_ - errorThreshhold * gold_; /* printf("%ix%i * %ix%i = %ix%i:\n", matrixDim1_, matrixDim2_, matrixDim2_, matrixDim1_, matrixDim1_, matrixDim1_ ); */ for (unsigned int i = 0; i < output1BufferSize_ / sizeof(float); i++) { float value = output1Ptr_[i]; bool equal = (value > eqMin && value < eqMax); if (!equal) { #if 0 printf("Output[%i] = %.6e; gold_ = %.6e; %s\n", i, value, gold_, equal ? "Equal" : "NOT Equal"); #endif // printf("FAILURE\n"); // CHECK_RESULT_NO_RETURN(1, "Data validation failed!\n"); _errorFlag = true; break; } else { // printf("M[%i] = %.6e\n", i, output1Ptr_[i]); } } } /******************************************************************************* * Close - delete all data and release opencl objects ******************************************************************************/ unsigned int OCLPerfKernelThroughput::close(void) { ENTER("close"); _wrapper->clFinish(cmd_queue_); if (global_work_size_) { delete[] global_work_size_; global_work_size_ = NULL; } if (local_work_size_) { delete[] local_work_size_; local_work_size_ = NULL; } // switch for memory type switch (memPathIdx_) { case 0: // zero copy // unmap ptr if (input1Ptr_) { error_ = /*_wrapper->*/ clEnqueueUnmapMemObject( cmd_queue_, input1Buffer_, input1Ptr_, 0, NULL, NULL); CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clEnqueueUnmapMemObject(input_) failed"); _wrapper->clFinish(cmd_queue_); error_ = _wrapper->clReleaseMemObject(input1Buffer_); CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseMemObject(input1Buffer_) failed"); input1Buffer_ = 0; } if (input2Ptr_) { error_ = /*_wrapper->*/ clEnqueueUnmapMemObject( cmd_queue_, input2Buffer_, input2Ptr_, 0, NULL, NULL); CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clEnqueueUnmapMemObject(input_) failed"); _wrapper->clFinish(cmd_queue_); error_ = _wrapper->clReleaseMemObject(input2Buffer_); CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseMemObject(input2Buffer_) failed"); input2Buffer_ = 0; } if (output1Ptr_) { error_ = /*_wrapper->*/ clEnqueueUnmapMemObject( cmd_queue_, output1Buffer_, output1Ptr_, 0, NULL, NULL); CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clEnqueueUnmapMemObject(output_) failed"); _wrapper->clFinish(cmd_queue_); error_ = _wrapper->clReleaseMemObject(output1Buffer_); CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseMemObject(input1Buffer_) failed"); output1Buffer_ = 0; } break; case 1: // explicit copy to device memory // release object if (input1Buffer_) { error_ = _wrapper->clReleaseMemObject(input1Buffer_); CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseMemObject(input1Buffer_) failed"); input1Buffer_ = 0; } if (input2Buffer_) { error_ = _wrapper->clReleaseMemObject(input2Buffer_); CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseMemObject(input2Buffer_) failed"); input2Buffer_ = 0; } if (output1Buffer_) { error_ = _wrapper->clReleaseMemObject(output1Buffer_); CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseMemObject(input1Buffer_) failed"); output1Buffer_ = 0; } if (input1Ptr_) { delete[] input1Ptr_; input1Ptr_ = 0; } if (input2Ptr_) { delete[] input2Ptr_; input2Ptr_ = 0; } if (output1Ptr_) { delete[] output1Ptr_; output1Ptr_ = 0; } break; } if (kernel_) { error_ = _wrapper->clReleaseKernel(kernel_); CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseKernel failed"); kernel_ = 0; } if (program_) { error_ = _wrapper->clReleaseProgram(program_); CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseProgram failed"); program_ = 0; } if (cmd_queue_) { error_ = _wrapper->clReleaseCommandQueue(cmd_queue_); CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseCommandQueue failed"); cmd_queue_ = 0; } if (context_) { error_ = _wrapper->clReleaseContext(context_); CHECK_RESULT_NO_RETURN(error_ != CL_SUCCESS, "clReleaseContext failed"); context_ = 0; } _wrapper->clFinish(cmd_queue_); EXIT("close"); return _crcword; }