/******************************************************************************* * * MIT License * * Copyright (c) 2020 Advanced Micro Devices, Inc. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in all * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. * *******************************************************************************/ #ifndef GUARD_MIOPEN_REDUCE_DRIVER_HPP #define GUARD_MIOPEN_REDUCE_DRIVER_HPP #include "../test/verify.hpp" #include "InputFlags.hpp" #include "driver.hpp" #include "tensor_driver.hpp" #include "timer.hpp" #include #include #include #include #include #include #include #include #include #include #include #include #include #include "random.hpp" #include "miopen_Reduction.hpp" template class ReduceDriver : public Driver { public: ReduceDriver() : Driver() { miopenCreateTensorDescriptor(&inputTensor); miopenCreateTensorDescriptor(&outputTensor); miopenCreateReduceTensorDescriptor(&reduceDesc); if(std::is_same::value) data_type = miopenDouble; else data_type = (sizeof(Tgpu) == 4) ? miopenFloat : miopenHalf; } int AddCmdLineArgs() override; int ParseCmdLineArgs(int argc, char* argv[]) override; InputFlags& GetInputFlags() override { return inflags; } int GetandSetData() override; std::vector GetInputTensorLengthsFromCmdLine(); std::vector GetDimsToReduceFromCmdLine(); int SetReduceTensorDescriptorFromCmdLineArgs(); int AllocateBuffersAndCopy() override; int RunForwardGPU() override; int RunForwardCPU(); int RunBackwardGPU() override; int RunBackwardCPU(); int VerifyBackward() override; int VerifyForward() override; ~ReduceDriver() override { miopenDestroyTensorDescriptor(outputTensor); miopenDestroyTensorDescriptor(inputTensor); miopenDestroyReduceTensorDescriptor(reduceDesc); } private: InputFlags inflags; miopenTensorDescriptor_t inputTensor; miopenTensorDescriptor_t outputTensor; std::vector dimsToReduce; std::vector dimsInvariant; std::unique_ptr in_dev; std::unique_ptr out_dev; std::unique_ptr ws_dev; std::unique_ptr indices_dev; std::vector in; std::vector out; std::vector outhost; std::vector out_indices; std::vector outhost_indices; bool need_indices; std::size_t ws_sizeInBytes; std::size_t indices_sizeInBytes; miopenReduceTensorDescriptor_t reduceDesc; }; template int ReduceDriver::ParseCmdLineArgs(int argc, char* argv[]) { inflags.Parse(argc, argv); if(inflags.GetValueInt("time") == 1) { miopenEnableProfiling(GetHandle(), true); } return 0; } template int ReduceDriver::GetandSetData() { std::vector inLengths = GetInputTensorLengthsFromCmdLine(); std::vector toReduceDims = GetDimsToReduceFromCmdLine(); std::vector outLengths = inLengths; std::vector invariantDims; assert(toReduceDims.size() <= inLengths.size()); for(int i = 0; i < toReduceDims.size(); i++) assert(toReduceDims[i] < inLengths.size()); // set the lengths of the dimensions to be reduced to 1 to represent the output Tensor for(int i = 0; i < toReduceDims.size(); i++) outLengths[toReduceDims[i]] = 1; SetTensorNd(inputTensor, inLengths, data_type); SetTensorNd(outputTensor, outLengths, data_type); SetReduceTensorDescriptorFromCmdLineArgs(); this->dimsToReduce = toReduceDims; for(int i = 0; i < inLengths.size(); i++) if(inLengths[i] == outLengths[i]) invariantDims.push_back(i); this->dimsInvariant = invariantDims; return (0); } template int ReduceDriver::AddCmdLineArgs() { inflags.AddInputFlag("forw", 'F', "1", "Run only Forward (Default=1)", "int"); inflags.AddInputFlag("DimLengths", 'D', "100,60,16,240", "The dimensional lengths of the input tensor", "string"); inflags.AddInputFlag( "DimsToReduce", 'R', "0,2", "The indices of the dimensions to be reduced", "string"); inflags.AddInputFlag("ReduceOp", 'O', "0,2", "Reduction Operation Type (check the enum miopenReduceTensorOp_t in " "miopen.h) (Default=0 to represent Add of two values)", "int"); inflags.AddInputFlag("CompType", 'C', "1", "The computation type of the Reduce operation (check the enum " "miopenDataType_t in miopen.h) (Default=1(Float)", "int"); inflags.AddInputFlag("NanPropagation", 'N', "0", "Nan number propagation mode (check the miopenNanPropagation_t in " "miopen.h) (Default=0 to indicate no Nan propagation)", "int"); inflags.AddInputFlag("IndicesUsed", 'I', "0,1", "whether indices of the reduced values are outputed when Min/Max " "operation is used (Default=0 to indicate no indices outputed)", "int"); inflags.AddInputFlag("alpha", 'A', "1.0", "Scale factor for input tensor", "double"); inflags.AddInputFlag("beta", 'B', "0.0", "Scale factor for output tensor", "double"); inflags.AddInputFlag( "wall", 'w', "0", "Wall-clock Time Each Layer, Requires time == 1 (Default=0)", "int"); inflags.AddInputFlag("iter", 'i', "1", "Number of Iterations (Default=1)", "int"); inflags.AddInputFlag("verify", 'V', "1", "Verify Each Layer (Default=1)", "int"); inflags.AddInputFlag("time", 't', "0", "Time Each Layer (Default=0)", "int"); inflags.AddInputFlag("dump_output", 'o', "0", "Dumps the output buffers (Default=0)", "int"); inflags.AddInputFlag("in_data", 'd', "", "Input data filename (Default=)", "string"); return 0; } template std::vector ReduceDriver::GetInputTensorLengthsFromCmdLine() { std::string lengthsStr = inflags.GetValueStr("DimLengths"); std::vector lengths; std::size_t pos = 0; std::size_t new_pos; new_pos = lengthsStr.find(',', pos); while(new_pos != std::string::npos) { std::string sliceStr = lengthsStr.substr(pos, new_pos - pos); int len = std::stoi(sliceStr); lengths.push_back(len); pos = new_pos + 1; new_pos = lengthsStr.find(',', pos); }; std::string sliceStr = lengthsStr.substr(pos); int len = std::stoi(sliceStr); lengths.push_back(len); return (lengths); } template std::vector ReduceDriver::GetDimsToReduceFromCmdLine() { std::string lengthsStr = inflags.GetValueStr("DimsToReduce"); std::vector lengths; std::size_t pos = 0; std::size_t new_pos; new_pos = lengthsStr.find(',', pos); while(new_pos != std::string::npos) { std::string sliceStr = lengthsStr.substr(pos, new_pos - pos); int len = std::stoi(sliceStr); lengths.push_back(len); pos = new_pos + 1; new_pos = lengthsStr.find(',', pos); }; std::string sliceStr = lengthsStr.substr(pos); int len = std::stoi(sliceStr); lengths.push_back(len); return (lengths); } template int ReduceDriver::SetReduceTensorDescriptorFromCmdLineArgs() { miopenReduceTensorOp_t reduceOp = static_cast(inflags.GetValueInt("ReduceOp")); miopenDataType_t compType = static_cast(inflags.GetValueInt("CompType")); miopenNanPropagation_t nanOpt = static_cast(inflags.GetValueInt("NanPropagation")); miopenReduceTensorIndices_t indicesOpt = static_cast(inflags.GetValueInt("IndicesUsed")); miopenIndicesType_t indicesType = MIOPEN_32BIT_INDICES; // no other place is better to place this line of codes this->need_indices = (indicesOpt == MIOPEN_REDUCE_TENSOR_FLATTENED_INDICES) && (reduceOp == MIOPEN_REDUCE_TENSOR_MIN || reduceOp == MIOPEN_REDUCE_TENSOR_MAX || reduceOp == MIOPEN_REDUCE_TENSOR_AMAX); if(std::is_same::value) compType = miopenDouble; return (miopenSetReduceTensorDescriptor( reduceDesc, reduceOp, compType, nanOpt, indicesOpt, indicesType)); } template int ReduceDriver::AllocateBuffersAndCopy() { using reduce::convert_type; size_t in_nelem = GetTensorSize(inputTensor); size_t out_nelem = GetTensorSize(outputTensor); miopenGetReductionWorkspaceSize( GetHandle(), reduceDesc, inputTensor, outputTensor, &this->ws_sizeInBytes); miopenGetReductionIndicesSize( GetHandle(), reduceDesc, inputTensor, outputTensor, &this->indices_sizeInBytes); size_t ws_nelem = (!this->need_indices) ? this->ws_sizeInBytes / sizeof(Tgpu) : this->ws_sizeInBytes / (sizeof(Tgpu) + sizeof(int)); size_t indices_nelem = this->indices_sizeInBytes / sizeof(int); #if MIOPEN_BACKEND_OPENCL cl_context ctx; clGetCommandQueueInfo(q, CL_QUEUE_CONTEXT, sizeof(cl_context), &ctx, nullptr); #elif MIOPEN_BACKEND_HIP uint32_t ctx = 0; #endif in_dev = std::unique_ptr(new GPUMem(ctx, in_nelem, sizeof(Tgpu))); out_dev = std::unique_ptr(new GPUMem(ctx, out_nelem, sizeof(Tgpu))); ws_dev = this->need_indices ? std::unique_ptr(new GPUMem( ctx, ws_nelem * 2, std::max(sizeof(Tgpu), sizeof(int)))) : std::unique_ptr(new GPUMem(ctx, ws_nelem, sizeof(Tgpu))); indices_dev = std::unique_ptr(new GPUMem(ctx, indices_nelem, sizeof(int))); in = std::vector(in_nelem, convert_type(0.3f)); out = std::vector(out_nelem, convert_type(0.2f)); outhost = std::vector(out_nelem, convert_type(0.2f)); out_indices = std::vector(indices_nelem, static_cast(0)); outhost_indices = std::vector(indices_nelem, static_cast(0)); std::string inFileName = inflags.GetValueStr("in_data"); bool rdResult = false; if(!inFileName.empty()) rdResult = readBufferFromFile(in.data(), in.size(), inFileName.c_str()); if(!rdResult) { for(int i = 0; i < in_nelem; i++) { in[i] = RAN_GEN(convert_type(0.0f), convert_type(1.0f)); }; }; #if MIOPEN_BACKEND_OPENCL cl_int status; #elif MIOPEN_BACKEND_HIP int status; #endif status = in_dev->ToGPU(q, in.data()); status |= out_dev->ToGPU(q, out.data()); if(status != CL_SUCCESS) printf("Error copying data to GPU\n"); return miopenStatusSuccess; } template int ReduceDriver::RunForwardGPU() { auto alpha = static_cast(this->inflags.GetValueDouble("alpha")); auto beta = static_cast(this->inflags.GetValueDouble("beta")); if(this->need_indices) { alpha = 1.0f; beta = 0.0f; }; bool output_accumulate = !(reduce::float_equal_one(alpha) && reduce::float_equal_zero(beta)); const double alpha64 = alpha; const double beta64 = beta; const void* const alphaPtr = std::is_same::value ? static_cast(&alpha64) : static_cast(&alpha); const void* const betaPtr = std::is_same::value ? static_cast(&beta64) : static_cast(&beta); miopenReduceTensor(GetHandle(), reduceDesc, this->need_indices ? indices_dev->GetMem() : nullptr, // indices this->need_indices ? indices_sizeInBytes : 0, // indices size in bytes ws_sizeInBytes > 0 ? ws_dev->GetMem() : nullptr, // workspace ws_sizeInBytes, // workspace size in bytes alphaPtr, inputTensor, in_dev->GetMem(), betaPtr, outputTensor, out_dev->GetMem()); // must get the output here, since the host-based method only run once if(output_accumulate) { out_dev->FromGPU(GetStream(), out.data()); indices_dev->FromGPU(GetStream(), out_indices.data()); }; Timer t; START_TIME for(int i = 0; i < inflags.GetValueInt("iter"); i++) { miopenReduceTensor(GetHandle(), reduceDesc, this->need_indices ? indices_dev->GetMem() : nullptr, // indices this->need_indices ? indices_sizeInBytes : 0, // indices size in bytes ws_sizeInBytes > 0 ? ws_dev->GetMem() : nullptr, // workspace ws_sizeInBytes, // workspace size in bytes alphaPtr, inputTensor, in_dev->GetMem(), betaPtr, outputTensor, out_dev->GetMem()); } // for verifying correctness if(!output_accumulate) { out_dev->FromGPU(GetStream(), out.data()); indices_dev->FromGPU(GetStream(), out_indices.data()); }; if(inflags.GetValueInt("time") == 1) { float time = 0.0; miopenGetKernelTime(GetHandle(), &time); STOP_TIME if(WALL_CLOCK) printf("Wall-clock Time Reduction Elapsed: %f ms\n", t.gettime_ms() / inflags.GetValueInt("iter")); printf("GPU Kernel Time Reduction Elapsed: %f ms\n", time); } return miopenStatusSuccess; } template int ReduceDriver::RunForwardCPU() { return (0); } template int ReduceDriver::RunBackwardGPU() { return miopenStatusSuccess; } template int ReduceDriver::VerifyForward() { miopenReductionHost hostReduction(this->reduceDesc, this->inputTensor, this->outputTensor, this->dimsInvariant, this->dimsToReduce); auto alpha = static_cast(this->inflags.GetValueDouble("alpha")); auto beta = static_cast(this->inflags.GetValueDouble("beta")); auto reduceOp = static_cast(inflags.GetValueInt("ReduceOp")); if(indices_sizeInBytes > 0) { alpha = 1.0f; beta = 0.0f; }; hostReduction.Run(alpha, in.data(), beta, outhost.data(), outhost_indices.data()); auto error = miopen::rms_range(outhost, out); double tolerance = 1.5e-4; if(std::is_same::value) tolerance *= 4.0; if(std::is_same::value && reduceOp == MIOPEN_REDUCE_TENSOR_NORM2) tolerance *= 12.0; if(!std::isfinite(error) || error > tolerance) { std::cout << "ReduceTensor() FAILED with error = " << error << " , tolerance = " << tolerance << std::endl; } else { if(out_indices.size() > 0) { auto error2 = miopen::rms_range(outhost_indices, out_indices); if(!std::isfinite(error2) || std::abs(static_cast(error2)) != 0.0f) { std::cout << "ReduceTensor() with indices output FAILED: " << error2 << std::endl; } else { printf("ReduceTensor() with indices output Verifies on CPU and GPU (err=%f, " "err2=%f)\n", error, error2); }; } else { printf("ReduceTensor() Verifies on CPU and GPU (err=%f)\n", error); }; }; if(inflags.GetValueInt("dump_output")) { dumpBufferToFile("dump_in.bin", in.data(), in.size()); dumpBufferToFile("dump_out.bin", out.data(), out.size()); dumpBufferToFile("dump_outhost.bin", outhost.data(), outhost.size()); if(!out_indices.empty()) { dumpBufferToFile("dump_out_indices.bin", out_indices.data(), out_indices.size()); dumpBufferToFile( "dump_outhost_indices.bin", outhost_indices.data(), outhost_indices.size()); }; } return 0; } template int ReduceDriver::RunBackwardCPU() { return 0; } template int ReduceDriver::VerifyBackward() { return 0; } #endif // GUARD_MIOPEN_CONV_DRIVER_HPP