/******************************************************************************* * * MIT License * * Copyright (c) 2017 Advanced Micro Devices, Inc. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in all * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. * *******************************************************************************/ #ifndef GUARD_MIOPEN_CONV_DRIVER_HPP #define GUARD_MIOPEN_CONV_DRIVER_HPP /*#ifdef MIOPEN_NEURON_SOFTRELU /// \todo This needs to be explained or rewritten in clear manner. #undef MIOPEN_NEURON_SOFTRELU #endif #ifdef MIOPEN_NEURON_POWER /// \todo This needs to be explained or rewritten in clear manner. #undef MIOPEN_NEURON_POWER #endif */ #include "InputFlags.hpp" #include "conv_verify.hpp" #include "driver.hpp" #include "mloConvHost.hpp" #include "tensor_driver.hpp" #include "timer.hpp" #include "util_driver.hpp" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "random.hpp" #include #include #include #include #include #include #include <../test/verify.hpp> #include <../test/serialize.hpp> #include <../test/tensor_holder.hpp> #include <../test/cpu_conv.hpp> #include <../test/cpu_bias.hpp> #include // Declare hidden function for MIGraphX to smoke test it. extern "C" miopenStatus_t miopenHiddenSetConvolutionFindMode(miopenConvolutionDescriptor_t convDesc, int findMode); #define WORKAROUND_ISSUE_2176 1 // https://github.com/AMDComputeLibraries/MLOpen/issues/2176 MIOPEN_DECLARE_ENV_VAR(MIOPEN_DRIVER_PAD_BUFFERS_2M) MIOPEN_DECLARE_ENV_VAR(MIOPEN_DRIVER_USE_GPU_REFERENCE) #if MIOPEN_BACKEND_OPENCL #define STATUS_SUCCESS CL_SUCCESS typedef cl_int status_t; #else // MIOPEN_BACKEND_HIP #define STATUS_SUCCESS 0 typedef int status_t; #endif struct AutoMiopenWarmupMode { AutoMiopenWarmupMode() { debug_logging_quiet_prev = miopen::debug::LoggingQuiet; debug_find_enforce_disable_prev = miopen::debug::FindEnforceDisable; miopen::debug::LoggingQuiet = true; miopen::debug::FindEnforceDisable = true; } AutoMiopenWarmupMode(const AutoMiopenWarmupMode&) = delete; AutoMiopenWarmupMode(AutoMiopenWarmupMode&&) = delete; AutoMiopenWarmupMode& operator=(const AutoMiopenWarmupMode&) = delete; AutoMiopenWarmupMode& operator=(AutoMiopenWarmupMode&&) = delete; ~AutoMiopenWarmupMode() { miopen::debug::LoggingQuiet = debug_logging_quiet_prev; miopen::debug::FindEnforceDisable = debug_find_enforce_disable_prev; } private: bool debug_logging_quiet_prev; bool debug_find_enforce_disable_prev; }; struct AutoConvDirectNaiveAlwaysEnable { AutoConvDirectNaiveAlwaysEnable() { prev = miopen::debug::AlwaysEnableConvDirectNaive; miopen::debug::AlwaysEnableConvDirectNaive = true; } ~AutoConvDirectNaiveAlwaysEnable() { miopen::debug::AlwaysEnableConvDirectNaive = prev; } private: bool prev; }; template void dumpBufferToFile(const char* fileName, T* data, size_t dataNumItems) { std::ofstream outFile(fileName, std::ios::binary); if(outFile) { outFile.write(reinterpret_cast(data), dataNumItems * sizeof(T)); outFile.close(); printf("Wrote output to file %s\n", fileName); } else { printf("Could not open file %s for writing\n", fileName); } } template bool readBufferFromFile(T* data, size_t dataNumItems, const char* fileName) { std::ifstream infile(fileName, std::ios::binary); if(infile) { infile.read(reinterpret_cast(data), dataNumItems * sizeof(T)); infile.close(); printf("Read data from input file %s\n", fileName); return true; } else { printf("Could not open file %s for reading\n", fileName); return false; } } // Tgpu and Tref are the data-type in GPU memory and CPU memory respectively. // They are not necessarily the same as the computation type on GPU or CPU template class ConvDriver : public Driver { #if MIOPEN_BACKEND_OPENCL typedef cl_context context_t; #elif MIOPEN_BACKEND_HIP typedef uint32_t context_t; #endif public: ConvDriver() : Driver() { miopenCreateTensorDescriptor(&inputTensor); miopenCreateTensorDescriptor(&weightTensor); miopenCreateTensorDescriptor(&outputTensor); miopenCreateTensorDescriptor(&biasTensor); miopenCreateTensorDescriptor(&inputTensor_vect4); miopenCreateTensorDescriptor(&weightTensor_vect4); miopenCreateConvolutionDescriptor(&convDesc); { AutoMiopenWarmupMode warmupMode; miopenCreateTensorDescriptor(&warmupInputTensor); miopenCreateTensorDescriptor(&warmupWeightTensor); miopenCreateTensorDescriptor(&warmupOutputTensor); miopenCreateConvolutionDescriptor(&warmupConvDesc); } workspace_dev = nullptr; // the variable name is implementation dependent, checking size instead InitDataType(); } int AddCmdLineArgs() override; int ParseCmdLineArgs(int argc, char* argv[]) override; InputFlags& GetInputFlags() override { return inflags; } // function to validate the Layout type parameters. // Layout types are -In,Out,Fil etc.This function validates the // layout parameter value to std (NCHW/NHWC/NCDHW/NDHWC) values, // defined in MIOpen lib. // layout_type - input value supplied with MIOpen driver command. void ValidateLayoutInputParameters(std::string layout_type); // Helper function to check the Layout type short names // Short names are defined as I,O,f. W.r.t In/Out/fil layout // types. int ChkLayout_ShortName(); int GetandSetData() override; std::vector GetInputTensorLengthsFromCmdLine(); std::vector GetWeightTensorLengthsFromCmdLine(); std::vector GetBiasTensorLengthsFromCmdLine(); int SetConvDescriptorFromCmdLineArgs(); std::vector GetOutputTensorLengths(); int AllocateBuffersAndCopy() override; bool UseGPUReference(); int FindForward(int& ret_algo_count, int request_algo_count, std::vector& perf_results, context_t ctx); int RunForwardGPU() override; int RunForwardCPU(); int RunForwardGPUReference(); int RunWarmupFindForwardGPU(); int FindBackwardData(int& ret_algo_count, int request_algo_count, std::vector& perf_results, context_t ctx); int FindBackwardWeights(int& ret_algo_count, int request_algo_count, std::vector& perf_results, context_t ctx); int RunBackwardGPU() override; int RunBackwardDataCPU(); int RunBackwardWeightsCPU(); int RunBackwardBiasCPU(); int RunBackwardDataGPUReference(); int RunBackwardWeightsGPUReference(); // int RunBackwardBiasGPUReference(); int VerifyBackward() override; int VerifyForward() override; ~ConvDriver() override { miopenDestroyTensorDescriptor(biasTensor); miopenDestroyTensorDescriptor(outputTensor); miopenDestroyTensorDescriptor(weightTensor); miopenDestroyTensorDescriptor(inputTensor); miopenDestroyTensorDescriptor(inputTensor_vect4); miopenDestroyTensorDescriptor(weightTensor_vect4); miopenDestroyConvolutionDescriptor(convDesc); miopenDestroyTensorDescriptor(warmupInputTensor); miopenDestroyTensorDescriptor(warmupWeightTensor); miopenDestroyTensorDescriptor(warmupOutputTensor); miopenDestroyConvolutionDescriptor(warmupConvDesc); } private: const miopenDataType_t warmup_data_type = miopenFloat; typedef float warmup_Tgpu; InputFlags inflags; boost::optional immediate_solution; miopenTensorDescriptor_t inputTensor; miopenTensorDescriptor_t weightTensor; miopenTensorDescriptor_t outputTensor; miopenTensorDescriptor_t biasTensor; miopenTensorDescriptor_t inputTensor_vect4; miopenTensorDescriptor_t weightTensor_vect4; miopenTensorDescriptor_t warmupInputTensor; miopenTensorDescriptor_t warmupWeightTensor; miopenTensorDescriptor_t warmupOutputTensor; std::unique_ptr in_dev; std::unique_ptr in_vect4_dev; std::unique_ptr din_dev; std::unique_ptr wei_dev; std::unique_ptr wei_vect4_dev; std::unique_ptr dwei_dev; std::unique_ptr out_dev; std::unique_ptr dout_dev; std::unique_ptr b_dev; std::unique_ptr db_dev; std::unique_ptr warmup_in_dev; std::unique_ptr warmup_wei_dev; std::unique_ptr warmup_out_dev; std::unique_ptr workspace_dev; std::size_t ws_sizeof_find_fwd; std::size_t ws_sizeof_find_bwd; std::size_t ws_sizeof_find_wrw; std::size_t warmup_ws_sizeof_find; tensor in; tensor wei; tensor out; tensor dout; tensor b; tensor outhost; tensor dwei_host; tensor din_host; tensor db_host; tensor warmup_in; tensor warmup_wei; tensor warmup_out; std::vector din; std::vector dwei; std::vector out_int8; std::vector db; std::vector b_int8; miopenConvolutionDescriptor_t convDesc; miopenConvolutionDescriptor_t warmupConvDesc; bool is_wrw = true, is_bwd = true, is_fwd = true; bool is_wrw_winograd = false; bool is_wrw_igemm = false; bool is_fwd_igemm = false; bool is_bwd_igemm = false; bool time_enabled = false; bool wall_enabled = false; bool warmup_enabled = false; int num_iterations = 1; // Used to avoid wasting time for verification after failure of Run*GPU(). // We can't properly control this from the main() level. // RunBackwardGPU() and VerifyBackward() do the job for both Bwd and WrW. // If RunBackwardGPU() fails, then main() doesn't know if Bwd or WrW has failed. // Also main() has no ways to for controlling how Verify works except skipping the whole call. bool is_fwd_run_failed = false, is_bwd_run_failed = false, is_wrw_run_failed = false; Timer wall; Timer2 fwd_auxiliary; Timer2 bwd_auxiliary; Timer2 wrw_auxiliary; Timer2 fwd_auxiliary_gwss; Timer2 bwd_auxiliary_gwss; Timer2 wrw_auxiliary_gwss; Timer2 warmup_wall_total; // Counts also auxiliary time. void PrintForwardTime(float kernel_total_time, float kernel_first_time) const; int RunForwardGpuImmed(bool is_transform); int RunForwardGpuFind(bool is_transform); void PrintBackwardDataTime(float kernel_total_time, float kernel_first_time); int RunBackwardDataGpuImmed(); int RunBackwardDataGpuFind(); void PrintBackwardWrwTime(float kernel_total_time, float kernel_first_time); int RunBackwardWrwGpuImmed(); int RunBackwardWrwGpuFind(); double GetDefaultTolerance() const { // Computation error of fp16 is ~2^13 (=8192) bigger than // the one of fp32 because mantissa is shorter by 13 bits. auto tolerance = (sizeof(Tgpu) == 4 || sizeof(Tgpu) == 1) ? 1.5e-6 : 8.2e-3; // bf16 mantissa has 7 bits, by 3 bits shorter than fp16. if(std::is_same::value) tolerance *= 8.0; return tolerance; } enum class Direction { Fwd, Bwd, WrW, BwdBias }; std::string GetVerificationCacheFileName(const Direction& direction) const; bool IsInputTensorTransform() const; bool TryReadVerificationCache(const Direction& direction, miopenTensorDescriptor_t& tensorDesc, Tref* data) const; void TrySaveVerificationCache(const Direction& direction, std::vector& data) const; void ResizeWorkspaceDev(context_t ctx, std::size_t size) { workspace_dev.reset(); if(size > 0) workspace_dev = std::unique_ptr(new GPUMem(ctx, size, 1)); } // Helper functions, can be moved out of class. void PrintImmedSolutionInfo(const miopenConvSolution_t& s) const { std::cout << "- id: " << s.solution_id << " algo: " << s.algorithm << ", time: " << s.time << " ms, ws: " << s.workspace_size << ", name: " << miopen::solver::Id(s.solution_id).ToString() << std::endl; } std::string AlgorithmSolutionToString(const miopenConvSolution_t& s) const { std::ostringstream oss; oss << "Algorithm: " << s.algorithm << ", Solution: " << s.solution_id << '/' << ((s.solution_id != 0) ? miopen::solver::Id(s.solution_id).ToString() : std::string("UNKNOWN")); return oss.str(); } /// Find() updates find-db with the most recent information (unless find-db is disabled). /// Therefore, after Find(), Immediate mode returns the "best" found solution /// as the 1st solution in the list, and we can use Immediate mode to find out /// the name of the Solver selected during Find() and then used in Run(). void GetSolutionAfterFind(const miopenConvAlgoPerf_t& found, const Direction& direction, const miopenTensorDescriptor_t& inTensor, const miopenTensorDescriptor_t& weiTensor, const miopenTensorDescriptor_t& outTensor, miopenConvSolution_t& solution); }; // Check if int8 type tensor x and w need to be transformed to a pack of 4 elements along channel // (NCHW_VECT_C format) template bool ConvDriver::IsInputTensorTransform() const { return (data_type == miopenInt8 && inflags.GetValueInt("in_channels") % 4 != 0) || data_type == miopenInt8x4; } template int ConvDriver::ParseCmdLineArgs(int argc, char* argv[]) { inflags.Parse(argc, argv); // try to set a default layout value for 3d conv if not specified from cmd line int spatial_dim = inflags.GetValueInt("spatial_dim"); const std::string default_layout = (spatial_dim == 2) ? "NCHW" : "NCDHW"; // inflags value is empty, default value is used // if it is supplied via cmd line, check the value. if(inflags.GetValueStr("in_layout").empty()) { inflags.SetValue("in_layout", default_layout); } else { std::string in_layoutValue = inflags.GetValueStr("in_layout"); ValidateLayoutInputParameters(in_layoutValue); } // fil layout argument value check if(inflags.GetValueStr("fil_layout").empty()) { inflags.SetValue("fil_layout", default_layout); } else { std::string fil_layoutValue = inflags.GetValueStr("fil_layout"); ValidateLayoutInputParameters(fil_layoutValue); } // out layout argument check if(inflags.GetValueStr("out_layout").empty()) { inflags.SetValue("out_layout", default_layout); } else { std::string out_layoutValue = inflags.GetValueStr("out_layout"); ValidateLayoutInputParameters(out_layoutValue); } num_iterations = inflags.GetValueInt("iter"); if(num_iterations < 1) { std::cout << "Fatal: Number of iterations must be > 0: " << num_iterations << std::endl; return 1; } time_enabled = (inflags.GetValueInt("time") != 0); { const int val = inflags.GetValueInt("wall"); if(val >= 1) { if(!time_enabled) { std::cout << "Info: '--wall " << val << "' is ignored because '--time' is not set" << std::endl; } else { wall_enabled = (val >= 1); warmup_enabled = (val >= 2); } } } if(time_enabled) { miopenEnableProfiling(GetHandle(), true); } is_fwd = (inflags.GetValueInt("forw") == 0 || inflags.GetValueInt("forw") & 1); is_bwd = (inflags.GetValueInt("forw") == 0 || inflags.GetValueInt("forw") & 2); is_wrw = (inflags.GetValueInt("forw") == 0 || inflags.GetValueInt("forw") & 4); const auto solution_str = inflags.GetValueStr("solution"); auto solution_value = static_cast(miopen::solver::Id(solution_str.c_str()).Value()); if(solution_value == 0) // Assume number on input { solution_value = std::strtol(solution_str.c_str(), nullptr, 10); if(errno == ERANGE) { errno = 0; solution_value = 0; } } if(solution_value >= 0) immediate_solution = solution_value; return 0; } template void ConvDriver::ValidateLayoutInputParameters(std::string layout_value) { if((ChkLayout_ShortName())) { std::cerr << " Invalid Layout Short Name = " << ChkLayout_ShortName() << std::endl; exit(EXIT_FAILURE); } else { if((layout_value.compare("NCHW") == 0) || (layout_value.compare("NHWC") == 0) || (layout_value.compare("NCDHW") == 0) || (layout_value.compare("NDHWC") == 0)) { // do nothing,Values are matching as defined in Lib. } else { std::cerr << "Invalid Layout Parameter Value - " << layout_value << std::endl; exit(EXIT_FAILURE); } } } template int ConvDriver::ChkLayout_ShortName() { // check for short name of layout type if((inflags.FindShortName("in_layout") == 'I') && (inflags.FindShortName("out_layout") == 'O') && (inflags.FindShortName("fil_layout") == 'f')) { // do noting // found valid short names return 0; } else { std::cerr << "Error:Invalid Short Name!" << std::endl; exit(EXIT_FAILURE); } } template int ConvDriver::GetandSetData() { std::vector in_len = GetInputTensorLengthsFromCmdLine(); std::vector wei_len = GetWeightTensorLengthsFromCmdLine(); SetTensorNd(inputTensor, in_len, inflags.GetValueStr("in_layout"), data_type); SetTensorNd(weightTensor, wei_len, inflags.GetValueStr("fil_layout"), data_type); if(inflags.GetValueInt("tensor_vect") == 1 && data_type == miopenInt8) { data_type = miopenInt8x4; } if(IsInputTensorTransform()) { std::vector in_len_vect4(in_len.begin(), in_len.end()), wei_len_vect4(wei_len.begin(), wei_len.end()); in_len_vect4[1] = ((in_len[1] + 3) / 4) * 4; SetTensorNd(inputTensor_vect4, in_len_vect4, data_type); wei_len_vect4[1] = ((wei_len[1] + 3) / 4) * 4; SetTensorNd(weightTensor_vect4, wei_len_vect4, data_type); } SetConvDescriptorFromCmdLineArgs(); std::vector out_len = GetOutputTensorLengths(); miopenDataType_t y_type = (data_type == miopenInt8 || data_type == miopenInt8x4) ? miopenFloat : data_type; SetTensorNd(outputTensor, out_len, inflags.GetValueStr("out_layout"), y_type); if(inflags.GetValueInt("bias") != 0) { std::vector bias_len = GetBiasTensorLengthsFromCmdLine(); SetTensorNd(biasTensor, bias_len, data_type); } if(warmup_enabled) { AutoMiopenWarmupMode warmupMode; std::vector warmup_in_len = {1, 1, 16, 16}; // NCHW std::vector warmup_wei_len = {1, 1, 1, 1}; // KCYX SetTensorNd(warmupInputTensor, warmup_in_len, warmup_data_type); SetTensorNd(warmupWeightTensor, warmup_wei_len, warmup_data_type); const int spatial_dim = 2; const int group_count = 1; std::vector pads = {0, 0}; std::vector conv_strides = {1, 1}; std::vector conv_dilations = {1, 1}; miopenConvolutionMode_t mode = miopenConvolution; miopenInitConvolutionNdDescriptor(warmupConvDesc, spatial_dim, pads.data(), conv_strides.data(), conv_dilations.data(), mode); miopenSetConvolutionFindMode(warmupConvDesc, miopenConvolutionFindModeNormal); miopenHiddenSetConvolutionFindMode( warmupConvDesc, static_cast(miopenConvolutionFindModeNormal)); // Repeat via hidden API. miopenSetConvolutionGroupCount(warmupConvDesc, group_count); int warmup_out_len_size = miopen::deref(warmupInputTensor).GetSize(); std::vector warmup_out_len(warmup_out_len_size); miopenGetConvolutionNdForwardOutputDim(warmupConvDesc, warmupInputTensor, warmupWeightTensor, &warmup_out_len_size, warmup_out_len.data()); SetTensorNd(warmupOutputTensor, warmup_out_len, warmup_data_type); } return (0); } template int ConvDriver::AddCmdLineArgs() { inflags.AddInputFlag("in_layout", 'I', "", "Input Layout (Default=NCHW for 2d conv, NCDHW for 3d conv)", "string", true); inflags.AddInputFlag("out_layout", 'O', "", "Output Layout (Default=NCHW for 2d conv, NCDHW for 3d conv)", "string", true); inflags.AddInputFlag("fil_layout", 'f', "", "Filter Layout (Default=NCHW for 2d conv, NCDHW for 3d conv)", "string", true); inflags.AddInputFlag( "spatial_dim", '_', "2", "convolution spatial dimension (Default-2)", "int"); inflags.AddInputFlag("forw", 'F', "0", "Flag enables fwd, bwd, wrw convolutions" "\n0 fwd+bwd+wrw (default)" "\n1 fwd only" "\n2 bwd only" "\n4 wrw only" "\n3 fwd+bwd" "\n5 fwd+wrw" "\n6 bwd+wrw", "int"); inflags.AddInputFlag("batchsize", 'n', "100", "Mini-batch size (Default=100)", "int"); inflags.AddInputFlag("in_channels", 'c', "3", "Number of Input Channels (Default=3)", "int"); inflags.AddInputFlag("in_d", '!', "32", "Input Depth (Default=32)", "int"); inflags.AddInputFlag("in_h", 'H', "32", "Input Height (Default=32)", "int"); inflags.AddInputFlag("in_w", 'W', "32", "Input Width (Default=32)", "int"); inflags.AddInputFlag( "out_channels", 'k', "32", "Number of Output Channels (Default=32)", "int"); inflags.AddInputFlag("fil_d", '@', "3", "Filter Depth (Default=3)", "int"); inflags.AddInputFlag("fil_h", 'y', "3", "Filter Height (Default=3)", "int"); inflags.AddInputFlag("fil_w", 'x', "3", "Filter Width (Default=3)", "int"); inflags.AddInputFlag( "conv_stride_d", '#', "1", "Convolution Stride for Depth (Default=1)", "int"); inflags.AddInputFlag( "conv_stride_h", 'u', "1", "Convolution Stride for Height (Default=1)", "int"); inflags.AddInputFlag( "conv_stride_w", 'v', "1", "Convolution Stride for Width (Default=1)", "int"); inflags.AddInputFlag("pad_d", '$', "0", "Zero Padding for Depth (Default=0)", "int"); inflags.AddInputFlag("pad_h", 'p', "0", "Zero Padding for Height (Default=0)", "int"); inflags.AddInputFlag("pad_w", 'q', "0", "Zero Padding for Width (Default=0)", "int"); inflags.AddInputFlag("pad_val", 'r', "0", "Padding Value (Default=0)", "int"); inflags.AddInputFlag( "trans_output_pad_d", '%', "0", "Zero Padding Output for Depth (Default=0)", "int"); inflags.AddInputFlag( "trans_output_pad_h", 'Y', "0", "Zero Padding Output for Height (Default=0)", "int"); inflags.AddInputFlag( "trans_output_pad_w", 'X', "0", "Zero Padding Output for Width (Default=0)", "int"); inflags.AddInputFlag("iter", 'i', "10", "Number of Iterations (Default=10)", "int"); inflags.AddInputFlag("verify", 'V', "1", "Verify Each Layer (Default=1)", "int"); inflags.AddInputFlag("verification_cache", 'C', "", "Use specified directory to cache verification data. Off by default.", "string"); inflags.AddInputFlag("time", 't', "0", "Time Each Layer (Default=0)", "int"); inflags.AddInputFlag("wall", 'w', "0", "Wall-clock Time Each Layer" "\n0 Off (Default)" "\n1 On, requires '--time 1')" "\n2 On, warm-up the library (prefetch db caches), requires '--time 1')", "int"); inflags.AddInputFlag("search", 's', "0", "Search Kernel Config (Default=0)", "int"); inflags.AddInputFlag("printconv", 'P', "1", "Print Convolution Dimensions (Default=1)", "int"); inflags.AddInputFlag("dump_output", 'o', "0", "Dumps the output buffers (Default=0)", "int"); inflags.AddInputFlag("in_data", 'd', "", "Input data filename (Default=)", "string"); inflags.AddInputFlag("weights", 'e', "", "Input weights filename (Default=)", "string"); inflags.AddInputFlag("bias", 'b', "", "Use Bias (Default=0)", "int"); inflags.AddInputFlag( "mode", 'm', "conv", "Convolution Mode (conv, trans) (Default=conv)", "str"); inflags.AddInputFlag( "pad_mode", 'z', "default", "Padding Mode (same, valid, default) (Default=default)", "str"); inflags.AddInputFlag("tensor_vect", 'Z', "0", "tensor vectorization type (none, vect_c, vect_n) (Default=0)", "int"); inflags.AddInputFlag("dilation_d", '^', "1", "Dilation of Filter Depth (Default=1)", "int"); inflags.AddInputFlag("dilation_h", 'l', "1", "Dilation of Filter Height (Default=1)", "int"); inflags.AddInputFlag("dilation_w", 'j', "1", "Dilation of Filter Width (Default=1)", "int"); inflags.AddInputFlag("in_bias", 'a', "", "Input bias filename (Default=)", "string"); inflags.AddInputFlag("group_count", 'g', "1", "Number of Groups (Default=1)", "int"); inflags.AddInputFlag("dout_data", 'D', "", "dy data filename for backward weight computation (Default=)", "string"); inflags.AddInputFlag("solution", 'S', "-1", "Use immediate mode, run solution with specified id." "\nAccepts integer argument N:" "\n=0 Immediate mode, build and run fastest solution" "\n>0 Immediate mode, build and run solution_id = N" "\n<0 Use Find() API (Default=-1)" "\nAlso accepts symbolic name of solution:" "\n Immediate mode, build and run specified solution" "\n Use Find() API", "string"); return 0; } template std::vector ConvDriver::GetInputTensorLengthsFromCmdLine() { std::vector in_lens; int spatial_dim = inflags.GetValueInt("spatial_dim"); in_lens.resize(2 + spatial_dim); in_lens[0] = inflags.GetValueInt("batchsize"); in_lens[1] = inflags.GetValueInt("in_channels"); auto in_spatial_lens = boost::adaptors::slice(in_lens, 2, 2 + spatial_dim); if(spatial_dim == 2) { in_spatial_lens[0] = inflags.GetValueInt("in_h"); in_spatial_lens[1] = inflags.GetValueInt("in_w"); } else if(spatial_dim == 3) { in_spatial_lens[0] = inflags.GetValueInt("in_d"); in_spatial_lens[1] = inflags.GetValueInt("in_h"); in_spatial_lens[2] = inflags.GetValueInt("in_w"); } else { MIOPEN_THROW("unsupported convolution dimension"); } return in_lens; } template std::vector ConvDriver::GetWeightTensorLengthsFromCmdLine() { std::vector wei_lens; int spatial_dim = inflags.GetValueInt("spatial_dim"); wei_lens.resize(2 + spatial_dim); auto wei_spatial_lens = boost::adaptors::slice(wei_lens, 2, 2 + spatial_dim); int group_count = std::max(inflags.GetValueInt("group_count"), 1); int wei_k_len = inflags.GetValueInt("out_channels"); int wei_c_len = inflags.GetValueInt("in_channels"); if(spatial_dim == 2) { wei_spatial_lens[0] = inflags.GetValueInt("fil_h"); wei_spatial_lens[1] = inflags.GetValueInt("fil_w"); } else if(spatial_dim == 3) { wei_spatial_lens[0] = inflags.GetValueInt("fil_d"); wei_spatial_lens[1] = inflags.GetValueInt("fil_h"); wei_spatial_lens[2] = inflags.GetValueInt("fil_w"); } else { MIOPEN_THROW("unsupported convolution dimension"); } if(group_count > 1) { if(wei_c_len % group_count != 0 || wei_k_len % group_count != 0 || group_count > wei_c_len || group_count > wei_k_len) { MIOPEN_THROW("Invalid group number\n"); } } miopenConvolutionMode_t mode; if((inflags.GetValueStr("mode")) == "conv") { mode = miopenConvolution; } else if((inflags.GetValueStr("mode")) == "trans") { mode = miopenTranspose; } else { MIOPEN_THROW("Incorrect Convolution Mode\n"); } if(mode == miopenTranspose) { wei_lens[0] = wei_c_len; wei_lens[1] = wei_k_len / group_count; } else { wei_lens[0] = wei_k_len; wei_lens[1] = wei_c_len / group_count; } return wei_lens; } template std::vector ConvDriver::GetBiasTensorLengthsFromCmdLine() { int spatial_dim = inflags.GetValueInt("spatial_dim"); std::vector bias_lens(2 + spatial_dim, 1); bias_lens[1] = inflags.GetValueInt("out_channels"); return bias_lens; } template int ConvDriver::SetConvDescriptorFromCmdLineArgs() { int spatial_dim = inflags.GetValueInt("spatial_dim"); std::vector in_spatial_lens(spatial_dim); std::vector wei_spatial_lens(spatial_dim); std::vector pads(spatial_dim); std::vector conv_strides(spatial_dim); std::vector conv_dilations(spatial_dim); std::vector trans_output_pads(spatial_dim); if(spatial_dim == 2) { in_spatial_lens[0] = inflags.GetValueInt("in_h"); in_spatial_lens[1] = inflags.GetValueInt("in_w"); wei_spatial_lens[0] = inflags.GetValueInt("fil_h"); wei_spatial_lens[1] = inflags.GetValueInt("fil_w"); pads[0] = inflags.GetValueInt("pad_h"); pads[1] = inflags.GetValueInt("pad_w"); conv_strides[0] = inflags.GetValueInt("conv_stride_h"); conv_strides[1] = inflags.GetValueInt("conv_stride_w"); conv_dilations[0] = inflags.GetValueInt("dilation_h"); conv_dilations[1] = inflags.GetValueInt("dilation_w"); trans_output_pads[0] = inflags.GetValueInt("trans_output_pad_h"); trans_output_pads[1] = inflags.GetValueInt("trans_output_pad_w"); } else if(spatial_dim == 3) { in_spatial_lens[0] = inflags.GetValueInt("in_d"); in_spatial_lens[1] = inflags.GetValueInt("in_h"); in_spatial_lens[2] = inflags.GetValueInt("in_w"); wei_spatial_lens[0] = inflags.GetValueInt("fil_d"); wei_spatial_lens[1] = inflags.GetValueInt("fil_h"); wei_spatial_lens[2] = inflags.GetValueInt("fil_w"); pads[0] = inflags.GetValueInt("pad_d"); pads[1] = inflags.GetValueInt("pad_h"); pads[2] = inflags.GetValueInt("pad_w"); conv_strides[0] = inflags.GetValueInt("conv_stride_d"); conv_strides[1] = inflags.GetValueInt("conv_stride_h"); conv_strides[2] = inflags.GetValueInt("conv_stride_w"); conv_dilations[0] = inflags.GetValueInt("dilation_d"); conv_dilations[1] = inflags.GetValueInt("dilation_h"); conv_dilations[2] = inflags.GetValueInt("dilation_w"); trans_output_pads[0] = inflags.GetValueInt("trans_output_pad_d"); trans_output_pads[1] = inflags.GetValueInt("trans_output_pad_h"); trans_output_pads[2] = inflags.GetValueInt("trans_output_pad_w"); } else { MIOPEN_THROW("unsupported convolution dimension"); } int out_c = inflags.GetValueInt("out_channels"); int in_c = inflags.GetValueInt("in_channels"); int group_count = std::max(inflags.GetValueInt("group_count"), 1); if(group_count > 1) { if(in_c % group_count != 0 || out_c % group_count != 0 || group_count > in_c || group_count > out_c) { printf("Invalid group number\n"); exit(0); // NOLINT (concurrency-mt-unsafe) } } miopenConvolutionMode_t mode; if((inflags.GetValueStr("mode")) == "conv") { mode = miopenConvolution; } else if((inflags.GetValueStr("mode")) == "trans") { mode = miopenTranspose; } else { printf("Incorrect Convolution Mode\n"); exit(0); // NOLINT (concurrency-mt-unsafe) } // adjust padding based on user-defined padding mode if(mode == miopenConvolution && (miopen::all_of(conv_dilations, [](auto v) { return v == 1; }) || miopen::all_of(wei_spatial_lens, [](auto v) { return v == 1; }))) { if((inflags.GetValueStr("pad_mode")) == "same") { for(int i = 0; i < spatial_dim; ++i) { pads[i] = (in_spatial_lens[i] % conv_strides[i] == 0) ? (std::max((wei_spatial_lens[i] - conv_strides[i]), 0)) : (std::max((wei_spatial_lens[i] - (in_spatial_lens[i] % conv_strides[i])), 0)); pads[i] /= 2; } } else if((inflags.GetValueStr("pad_mode")) == "valid") { for(int i = 0; i < spatial_dim; ++i) { pads[i] = 0; } } } miopenInitConvolutionNdDescriptor( convDesc, spatial_dim, pads.data(), conv_strides.data(), conv_dilations.data(), mode); miopenSetConvolutionGroupCount(convDesc, group_count); if(mode == miopenTranspose) { miopenSetTransposeConvNdOutputPadding(convDesc, spatial_dim, trans_output_pads.data()); } return miopenStatusSuccess; } template std::vector ConvDriver::GetOutputTensorLengths() { int ndim = miopen::deref(inputTensor).GetSize(); std::vector out_lens(ndim); miopenGetConvolutionNdForwardOutputDim( convDesc, inputTensor, weightTensor, &ndim, out_lens.data()); return out_lens; } namespace detail { template T RanGenWeights() { return RAN_GEN(static_cast(-0.5), static_cast(0.5)); } // Shift FP16 distribution towards positive numbers, // otherwise Winograd FP16 validation fails. template <> float16 RanGenWeights() { return RAN_GEN(static_cast(-1.0 / 3.0), static_cast(0.5)); } } // namespace detail template int ConvDriver::AllocateBuffersAndCopy() { if(wall_enabled) { fwd_auxiliary.start(); fwd_auxiliary.pause(); bwd_auxiliary.start(); bwd_auxiliary.pause(); wrw_auxiliary.start(); wrw_auxiliary.pause(); fwd_auxiliary_gwss.start(); fwd_auxiliary_gwss.pause(); bwd_auxiliary_gwss.start(); bwd_auxiliary_gwss.pause(); wrw_auxiliary_gwss.start(); wrw_auxiliary_gwss.pause(); if(warmup_enabled) { warmup_wall_total.start(); warmup_wall_total.pause(); } } bool is_transform = IsInputTensorTransform(); bool is_int8 = data_type == miopenInt8 || data_type == miopenInt8x4; size_t in_sz = GetTensorSize(inputTensor); size_t wei_sz = GetTensorSize(weightTensor); size_t out_sz = GetTensorSize(outputTensor); // Workaround: Pad buffers allocations to be a multiple of 2M if(miopen::IsEnabled(MIOPEN_DRIVER_PAD_BUFFERS_2M{})) { // PadBufferSize(in_sz, sizeof(Tgpu)); PadBufferSize(wei_sz, sizeof(Tgpu)); PadBufferSize(out_sz, sizeof(Tgpu)); } #if MIOPEN_BACKEND_OPENCL cl_context ctx; clGetCommandQueueInfo(q, CL_QUEUE_CONTEXT, sizeof(cl_context), &ctx, nullptr); #elif MIOPEN_BACKEND_HIP uint32_t ctx = 0; #endif ws_sizeof_find_fwd = 0; ws_sizeof_find_wrw = 0; ws_sizeof_find_bwd = 0; if(warmup_enabled) { do { AutoMiopenWarmupMode warmupMode; size_t warmup_in_sz = GetTensorSize(warmupInputTensor); size_t warmup_wei_sz = GetTensorSize(warmupWeightTensor); size_t warmup_out_sz = GetTensorSize(warmupOutputTensor); if(miopen::IsEnabled(MIOPEN_DRIVER_PAD_BUFFERS_2M{})) { PadBufferSize(warmup_wei_sz, sizeof(warmup_Tgpu)); PadBufferSize(warmup_out_sz, sizeof(warmup_Tgpu)); } warmup_ws_sizeof_find = 0; warmup_wall_total.resume(wall_enabled); miopenStatus_t rc = miopenConvolutionForwardGetWorkSpaceSize(GetHandle(), warmupWeightTensor, warmupInputTensor, warmupConvDesc, warmupOutputTensor, &warmup_ws_sizeof_find); warmup_wall_total.pause(wall_enabled); if(rc != miopenStatusSuccess) { std::cout << "Warm-up: Error getting workspace size, status = " << rc << ". Warm-up disabled." << std::endl; warmup_enabled = false; break; } if(warmup_ws_sizeof_find != 0) { std::cout << "Warm-up: This step should not require workspace, but asks for " << warmup_ws_sizeof_find << ". Warm-up disabled." << std::endl; warmup_enabled = false; break; } warmup_in = tensor(miopen::deref(warmupInputTensor).GetLengths(), miopen::deref(warmupInputTensor).GetStrides()); warmup_wei = tensor(miopen::deref(warmupWeightTensor).GetLengths(), miopen::deref(warmupWeightTensor).GetStrides()); warmup_out = tensor(miopen::deref(warmupOutputTensor).GetLengths(), miopen::deref(warmupOutputTensor).GetStrides()); warmup_in_dev = std::unique_ptr(new GPUMem(ctx, warmup_in_sz, sizeof(warmup_Tgpu))); warmup_wei_dev = std::unique_ptr(new GPUMem(ctx, warmup_wei_sz, sizeof(warmup_Tgpu))); warmup_out_dev = std::unique_ptr(new GPUMem(ctx, warmup_out_sz, sizeof(warmup_Tgpu))); status_t status = STATUS_SUCCESS; status |= warmup_in_dev->ToGPU(q, warmup_in.data.data()); status |= warmup_wei_dev->ToGPU(q, warmup_wei.data.data()); status |= warmup_out_dev->ToGPU(q, warmup_out.data.data()); if(status != STATUS_SUCCESS) { std::cout << "Warm-up: Error copying data to GPU, status = " << status << ". Warm-up disabled." << std::endl; warmup_enabled = false; break; } const int rcf = RunWarmupFindForwardGPU(); if(rcf != 0) { std::cout << "Warm-up: RunWarmupFindForwardGPU() failed, rcf = " << rcf << ". Warm-up disabled." << std::endl; warmup_enabled = false; break; } } while(false); } if(!immediate_solution) { miopenStatus_t rc = miopenStatusSuccess; if(is_wrw && rc == miopenStatusSuccess) { wrw_auxiliary.resume(wall_enabled); wrw_auxiliary_gwss.resume(wall_enabled); rc = miopenConvolutionBackwardWeightsGetWorkSpaceSize(GetHandle(), outputTensor, inputTensor, convDesc, weightTensor, &ws_sizeof_find_wrw); wrw_auxiliary_gwss.pause(wall_enabled); wrw_auxiliary.pause(wall_enabled); } if(is_bwd && rc == miopenStatusSuccess) { bwd_auxiliary.resume(wall_enabled); bwd_auxiliary_gwss.resume(wall_enabled); rc = miopenConvolutionBackwardDataGetWorkSpaceSize(GetHandle(), outputTensor, weightTensor, convDesc, inputTensor, &ws_sizeof_find_bwd); bwd_auxiliary_gwss.pause(wall_enabled); bwd_auxiliary.pause(wall_enabled); } if(is_fwd && rc == miopenStatusSuccess) { fwd_auxiliary.resume(wall_enabled); fwd_auxiliary_gwss.resume(wall_enabled); rc = miopenConvolutionForwardGetWorkSpaceSize( GetHandle(), (is_transform ? weightTensor_vect4 : weightTensor), (is_transform ? inputTensor_vect4 : inputTensor), convDesc, outputTensor, &ws_sizeof_find_fwd); fwd_auxiliary_gwss.pause(wall_enabled); fwd_auxiliary.pause(wall_enabled); } if(rc != miopenStatusSuccess) { std::cout << "Error getting workspace size, status = " << rc << std::endl; return rc; } } if(is_fwd || is_wrw) in = tensor(miopen::deref(inputTensor).GetLengths(), miopen::deref(inputTensor).GetStrides()); if(is_fwd || is_bwd) wei = tensor(miopen::deref(weightTensor).GetLengths(), miopen::deref(weightTensor).GetStrides()); if(is_fwd) out = tensor(miopen::deref(outputTensor).GetLengths(), miopen::deref(outputTensor).GetStrides()); if(is_bwd || is_wrw) dout = tensor(miopen::deref(outputTensor).GetLengths(), miopen::deref(outputTensor).GetStrides()); if(is_bwd) din = std::vector(in_sz, static_cast(0)); if(is_wrw) dwei = std::vector(wei_sz, static_cast(0)); if(is_int8) out_int8 = std::vector(out_sz, static_cast(0)); if(is_transform) { in_vect4_dev = std::unique_ptr( new GPUMem(ctx, GetTensorSize(inputTensor_vect4), sizeof(Tgpu))); wei_vect4_dev = std::unique_ptr( new GPUMem(ctx, GetTensorSize(weightTensor_vect4), sizeof(Tgpu))); } outhost = tensor(miopen::deref(outputTensor).GetLengths(), miopen::deref(outputTensor).GetStrides()); din_host = tensor(miopen::deref(inputTensor).GetLengths(), miopen::deref(inputTensor).GetStrides()); dwei_host = tensor(miopen::deref(weightTensor).GetLengths(), miopen::deref(weightTensor).GetStrides()); std::string inFileName = inflags.GetValueStr("in_data"); std::string weiFileName = inflags.GetValueStr("weights"); std::string biasFileName = inflags.GetValueStr("in_bias"); std::string doutFileName = inflags.GetValueStr("dout_data"); /* Unless seed is persistent between runs validation using cache stored in file is impossible. */ srand(0); bool dataRead = false; if(is_fwd || is_wrw) if(!inFileName.empty()) dataRead = readBufferFromFile(in.data.data(), in_sz, inFileName.c_str()); bool weiRead = false; if(is_fwd || is_bwd) if(!weiFileName.empty()) weiRead = readBufferFromFile(wei.data.data(), wei_sz, weiFileName.c_str()); if(is_int8) { float Data_scale = 127.0; if(!dataRead) { for(int i = 0; i < in_sz; i++) { if(is_fwd || is_wrw) in.data[i] = static_cast(Data_scale * RAN_GEN(static_cast(0.0), static_cast(1.0))); else /// \anchor move_rand /// Move rand() forward, even if buffer is unused. This provides the same /// initialization of input buffers regardless of which kinds of /// convolutions are currently selectedfor testing (see the "-F" option). /// Verification cache would be broken otherwise. GET_RAND(); } } if(inflags.GetValueInt("bias") != 0) { size_t b_sz = GetTensorSize(biasTensor); b_dev = std::unique_ptr(new GPUMem(ctx, b_sz, sizeof(float))); b_int8 = std::vector(b_sz, static_cast(0)); for(int i = 0; i < b_sz; i++) { b_int8[i] = static_cast(i % 8) + RAN_GEN(static_cast(0.0), static_cast(1.0)); } if(!biasFileName.empty()) { readBufferFromFile(b_int8.data(), b_sz, biasFileName.c_str()); } b_dev->ToGPU(q, b_int8.data()); } if(!weiRead) { for(int i = 0; i < wei_sz; i++) if(is_fwd || is_bwd) wei.data[i] = static_cast(Data_scale * 2 * detail::RanGenWeights()); else /// \ref move_rand GET_RAND(); } } else { Tgpu Data_scale = static_cast(0.01); bool doutRead = false; if(is_bwd || is_wrw) if(!doutFileName.empty()) doutRead = readBufferFromFile(dout.data.data(), out_sz, doutFileName.c_str()); if(!dataRead) { for(int i = 0; i < in_sz; i++) { if(is_fwd || is_wrw) in.data[i] = Data_scale * RAN_GEN(static_cast(0.0), static_cast(1.0)); else /// \ref move_rand GET_RAND(); } } if(!doutRead) { for(int i = 0; i < out_sz; i++) if(is_bwd || is_wrw) dout.data[i] = Data_scale * RAN_GEN(static_cast(0.0), static_cast(1.0)); else /// \ref move_rand GET_RAND(); } if(inflags.GetValueInt("bias") != 0) { size_t b_sz = GetTensorSize(biasTensor); b_dev = std::unique_ptr(new GPUMem(ctx, b_sz, sizeof(Tgpu))); db_dev = std::unique_ptr(new GPUMem(ctx, b_sz, sizeof(Tgpu))); b = tensor(miopen::deref(biasTensor).GetLengths(), miopen::deref(biasTensor).GetStrides()); db = std::vector(b_sz, static_cast(0)); db_host = tensor(miopen::deref(biasTensor).GetLengths(), miopen::deref(biasTensor).GetStrides()); for(int i = 0; i < b_sz; i++) { b.data[i] = static_cast(i % 8) + RAN_GEN(static_cast(0.0), static_cast(1.0)); db[i] = static_cast(i % 8) + RAN_GEN(static_cast(0.0), static_cast(1.0)); } if(!biasFileName.empty()) { readBufferFromFile(b.data.data(), b_sz, biasFileName.c_str()); } b_dev->ToGPU(q, b.data.data()); db_dev->ToGPU(q, db.data()); } if(!weiRead) { for(int i = 0; i < wei_sz; i++) if(is_fwd || is_bwd) wei.data[i] = Data_scale * detail::RanGenWeights(); else /// \ref move_rand GET_RAND(); } } if(inflags.GetValueInt("dump_output")) { if(is_fwd || is_wrw) dumpBufferToFile("dump_in.bin", in.data.data(), in_sz); if(is_fwd || is_bwd) dumpBufferToFile("dump_wei.bin", wei.data.data(), wei_sz); if(inflags.GetValueInt("bias") != 0) dumpBufferToFile("dump_bias.bin", b.data.data(), b.data.size()); if(is_bwd || is_wrw) dumpBufferToFile("dump_dout.bin", dout.data.data(), out_sz); } status_t status = STATUS_SUCCESS; if(is_fwd || is_wrw) { in_dev = std::unique_ptr(new GPUMem(ctx, in_sz, sizeof(Tgpu))); status |= in_dev->ToGPU(q, in.data.data()); } if(is_bwd) { din_dev = std::unique_ptr(new GPUMem(ctx, in_sz, sizeof(Tgpu))); status |= din_dev->ToGPU(q, din.data()); } if(is_fwd || is_bwd) { wei_dev = std::unique_ptr(new GPUMem(ctx, wei_sz, sizeof(Tgpu))); status |= wei_dev->ToGPU(q, wei.data.data()); } if(is_wrw) { dwei_dev = std::unique_ptr(new GPUMem(ctx, wei_sz, sizeof(Tgpu))); status |= dwei_dev->ToGPU(q, dwei.data()); } if(is_bwd || is_wrw) { dout_dev = std::unique_ptr(new GPUMem(ctx, out_sz, sizeof(Tgpu))); status |= dout_dev->ToGPU(q, dout.data.data()); } if(is_fwd) { out_dev = std::unique_ptr( new GPUMem(ctx, out_sz, is_int8 ? sizeof(float) : sizeof(Tgpu))); status |= (is_int8 ? out_dev->ToGPU(q, out_int8.data()) : out_dev->ToGPU(q, out.data.data())); } if(status != STATUS_SUCCESS) { std::cout << "Error copying data to GPU, status = " << status << std::endl; return miopenStatusNotInitialized; } return miopenStatusSuccess; } template bool ConvDriver::UseGPUReference() { if(miopen::IsEnabled(MIOPEN_DRIVER_USE_GPU_REFERENCE{})) { if(miopen_type{} == miopenFloat && (miopen_type{} == miopenFloat || miopen_type{} == miopenHalf || miopen_type{} == miopenBFloat16)) return true; else return false; } else return false; } template int ConvDriver::FindForward(int& ret_algo_count, int request_algo_count, std::vector& perf_results, context_t ctx) { bool is_transform = IsInputTensorTransform(); fwd_auxiliary.resume(wall_enabled); ResizeWorkspaceDev(ctx, ws_sizeof_find_fwd); const auto rc = miopenFindConvolutionForwardAlgorithm( GetHandle(), (is_transform ? inputTensor_vect4 : inputTensor), (is_transform ? in_vect4_dev->GetMem() : in_dev->GetMem()), (is_transform ? weightTensor_vect4 : weightTensor), (is_transform ? wei_vect4_dev->GetMem() : wei_dev->GetMem()), convDesc, outputTensor, out_dev->GetMem(), request_algo_count, &ret_algo_count, perf_results.data(), workspace_dev != nullptr ? workspace_dev->GetMem() : nullptr, ws_sizeof_find_fwd, (inflags.GetValueInt("search") == 1) ? true : false); fwd_auxiliary.pause(wall_enabled); return rc; } template void ConvDriver::PrintForwardTime(const float kernel_total_time, const float kernel_first_time) const { float kernel_average_time = num_iterations > 1 ? (kernel_total_time - kernel_first_time) / (num_iterations - 1) : kernel_first_time; printf("GPU Kernel Time Forward Conv. Elapsed: %f ms (average)\n", kernel_average_time); const auto num_dim = miopen::deref(inputTensor).GetSize() - 2; if(num_dim != 2 && num_dim != 3) { printf("stats: for conv%dd\n", num_dim); return; } int group_count = std::max(inflags.GetValueInt("group_count"), 1); if(num_dim == 2) { int in_n, in_c, in_h, in_w; std::tie(in_n, in_c, in_h, in_w) = miopen::tien<4>(miopen::deref(inputTensor).GetLengths()); int wei_c, wei_n, wei_h, wei_w; std::tie(wei_c, wei_n, wei_h, wei_w) = miopen::tien<4>(miopen::deref(weightTensor).GetLengths()); int out_n, out_c, out_h, out_w; std::tie(out_n, out_c, out_h, out_w) = miopen::tien<4>(miopen::deref(outputTensor).GetLengths()); size_t flopCnt = 2L * in_n * in_c * wei_h * wei_w * out_c * out_h * out_w / group_count; size_t inputBytes = in_n * in_c * in_h * in_w * miopen::GetTypeSize(miopen::deref(inputTensor).GetType()); size_t weightBytes = wei_n * wei_c * wei_h * wei_w * miopen::GetTypeSize(miopen::deref(weightTensor).GetType()); size_t readBytes = inputBytes + weightBytes; size_t outputBytes = 1.0 * out_n * out_c * out_h * out_w * miopen::GetTypeSize(miopen::deref(outputTensor).GetType()); printf("stats: name, n, c, ho, wo, x, y, k, flopCnt, bytesRead, bytesWritten, GFLOPs, " "GB/s, timeMs\n"); printf("stats: %s%dx%du%d, %u, %u, %u, %u, %u, %u, %u, %zu, %zu, %zu, %.0f, %.0f, %f\n", "fwd-conv", wei_h, wei_w, miopen::deref(convDesc).GetConvStrides()[0], in_n, in_c, out_h, out_w, wei_h, wei_w, out_c, flopCnt, readBytes, outputBytes, flopCnt / kernel_average_time / 1e6, (readBytes + outputBytes) / kernel_average_time / 1e6, kernel_average_time); } else { // 3d int in_n, in_c, in_d, in_h, in_w; std::tie(in_n, in_c, in_d, in_h, in_w) = miopen::tien<5>(miopen::deref(inputTensor).GetLengths()); int wei_c, wei_n, wei_d, wei_h, wei_w; std::tie(wei_c, wei_n, wei_d, wei_h, wei_w) = miopen::tien<5>(miopen::deref(weightTensor).GetLengths()); int out_n, out_c, out_d, out_h, out_w; std::tie(out_n, out_c, out_d, out_h, out_w) = miopen::tien<5>(miopen::deref(outputTensor).GetLengths()); size_t flopCnt = 2L * in_n * in_c * in_d * wei_h * wei_w * wei_d * out_c * out_d * out_h * out_w / group_count; size_t inputBytes = in_n * in_c * in_d * in_h * in_w * miopen::GetTypeSize(miopen::deref(inputTensor).GetType()); size_t weightBytes = wei_n * wei_c * wei_d * wei_h * wei_w * miopen::GetTypeSize(miopen::deref(weightTensor).GetType()); size_t readBytes = inputBytes + weightBytes; size_t outputBytes = 1.0 * out_n * out_c * out_d * out_h * out_w * miopen::GetTypeSize(miopen::deref(outputTensor).GetType()); printf("stats: name , n, c, do, ho, wo, z, y, x, k, flopCnt, bytesRead, bytesWritten, " "GFLOPs, " "GB/s, timeMs\n"); printf("stats: %s%dx%dx%du%d, %u, %u, %u, %u, %u, %u, %u, %u, %u, %zu, %zu, %zu, " "%.0f, %.0f, %f\n", "fwd-conv", wei_d, wei_h, wei_w, miopen::deref(convDesc).GetConvStrides()[0], in_n, in_c, out_d, out_h, out_w, wei_d, wei_h, wei_w, out_c, flopCnt, readBytes, outputBytes, flopCnt / kernel_average_time / 1e6, (readBytes + outputBytes) / kernel_average_time / 1e6, kernel_average_time); } } /// Always warm-ups Find API. Why: this is definitely required for Find mode. /// For Immediate mode, this guarantees that we won't hit fallback. /// Immediate mode API is warmed-up only when driver is used in Immediate mode. template int ConvDriver::RunWarmupFindForwardGPU() { if(!warmup_enabled) return 0; AutoMiopenWarmupMode warmupMode; int find_count; miopenConvAlgoPerf_t find_result; warmup_wall_total.resume(wall_enabled); auto rc = miopenFindConvolutionForwardAlgorithm(GetHandle(), warmupInputTensor, warmup_in_dev->GetMem(), warmupWeightTensor, warmup_wei_dev->GetMem(), warmupConvDesc, warmupOutputTensor, warmup_out_dev->GetMem(), 1, &find_count, &find_result, nullptr, 0, false); warmup_wall_total.pause(wall_enabled); if(rc != miopenStatusSuccess) return 10; if(find_count == 0) return 20; miopenConvSolution_t solution; if(immediate_solution) { std::size_t immed_count; warmup_wall_total.resume(wall_enabled); rc = miopenConvolutionForwardGetSolutionCount(handle, warmupWeightTensor, warmupInputTensor, warmupConvDesc, warmupOutputTensor, &immed_count); warmup_wall_total.pause(wall_enabled); if(rc != miopenStatusSuccess) return 30; if(immed_count < 1) return 40; warmup_wall_total.resume(wall_enabled); rc = miopenConvolutionForwardGetSolution(handle, warmupWeightTensor, warmupInputTensor, warmupConvDesc, warmupOutputTensor, 1, &immed_count, &solution); warmup_wall_total.pause(wall_enabled); if(rc != miopenStatusSuccess) return 50; if(immed_count < 1) return 60; if(solution.workspace_size != 0) return 70; warmup_wall_total.resume(wall_enabled); rc = miopenConvolutionForwardImmediate(handle, warmupWeightTensor, warmup_wei_dev->GetMem(), warmupInputTensor, warmup_in_dev->GetMem(), warmupConvDesc, warmupOutputTensor, warmup_out_dev->GetMem(), nullptr, 0, solution.solution_id); warmup_wall_total.pause(wall_enabled); if(rc != miopenStatusSuccess) return 80; } warmup_wall_total.stop(wall_enabled); std::ostringstream ss; ss << "Warm-up: "; if(wall_enabled) ss << "Wall-clock Total Time: " << warmup_wall_total.gettime_ms() << " ms, "; ss << "Find Algorithm: " << find_result.fwd_algo; if(immediate_solution) ss << ", Immediate Algorithm: " << miopen::ConvolutionAlgoToString(solution.algorithm) << '[' << solution.solution_id << ']'; ss << std::endl; std::cout << ss.str(); return rc; } template int ConvDriver::RunForwardGPU() { if(!is_fwd) return 0; int rc; bool is_transform = IsInputTensorTransform(); if(is_transform) { float aph = 1.0; float bta = 0.0; miopenTransformTensor(GetHandle(), &aph, inputTensor, in_dev->GetMem(), &bta, inputTensor_vect4, in_vect4_dev->GetMem()); miopenTransformTensor(GetHandle(), &aph, weightTensor, wei_dev->GetMem(), &bta, weightTensor_vect4, wei_vect4_dev->GetMem()); } if(immediate_solution) rc = RunForwardGpuImmed(is_transform); else rc = RunForwardGpuFind(is_transform); is_fwd_run_failed = (rc != 0); if(rc != miopenStatusSuccess) return rc; if(inflags.GetValueInt("bias") != 0) { float alpha = static_cast(1), beta = static_cast(0); miopenConvolutionForwardBias(GetHandle(), &alpha, biasTensor, b_dev->GetMem(), &beta, outputTensor, out_dev->GetMem()); if(time_enabled) { float time = 0.0; miopenGetKernelTime(GetHandle(), &time); printf("GPU Kernel Time Forward Conv. Bias Elapsed: %f ms\n", time); } } bool is_int8 = data_type == miopenInt8 || data_type == miopenInt8x4; if(is_int8) out_dev->FromGPU(GetStream(), out_int8.data()); else out_dev->FromGPU(GetStream(), out.data.data()); if(inflags.GetValueInt("dump_output")) { if(is_int8) dumpBufferToFile("dump_fwd_out_gpu.bin", out_int8.data(), out_int8.size()); else dumpBufferToFile("dump_fwd_out_gpu.bin", out.data.data(), out.data.size()); } return rc; } template void ConvDriver::GetSolutionAfterFind( const miopenConvAlgoPerf_t& found, const ConvDriver::Direction& direction, const miopenTensorDescriptor_t& in_tensor, const miopenTensorDescriptor_t& wei_tensor, const miopenTensorDescriptor_t& out_tensor, miopenConvSolution_t& solution) { AutoMiopenWarmupMode warmupMode; // Shut logging. miopenConvAlgorithm_t found_algo; switch(direction) { case Direction::Fwd: found_algo = static_cast(found.fwd_algo); break; case Direction::Bwd: found_algo = static_cast(found.bwd_data_algo); break; case Direction::WrW: found_algo = static_cast(found.bwd_weights_algo); break; case Direction::BwdBias: // nop MIOPEN_THROW("BwdBias is not supported"); } std::size_t immed_count = 0; miopenStatus_t rc = miopenStatusUnknownError; switch(direction) { case Direction::Fwd: rc = miopenConvolutionForwardGetSolution( handle, wei_tensor, in_tensor, convDesc, out_tensor, 1, &immed_count, &solution); break; case Direction::Bwd: rc = miopenConvolutionBackwardDataGetSolution( handle, out_tensor, wei_tensor, convDesc, in_tensor, 1, &immed_count, &solution); break; case Direction::WrW: rc = miopenConvolutionBackwardWeightsGetSolution( handle, out_tensor, in_tensor, convDesc, wei_tensor, 1, &immed_count, &solution); break; case Direction::BwdBias: // nop break; } if(rc != miopenStatusSuccess // (formatting) || immed_count < 1 // It should not be so if Find succeeded. || found_algo != solution.algorithm // These must match. || solution.time < 0) // Fallback mode (no entry in find-db -- disabled?) { // Ignore errors, just skip printing the solver information. solution.solution_id = 0; } } template int ConvDriver::RunForwardGpuFind(const bool is_transform) { int ret_algo_count; int request_algo_count = 2; // The library returns `request_algo_count` algorithms to the caller. However this does // not affect auto-tuning and find-db updates. Internally, the library searches for // *all* available algorithms during Find(), -- regardless of how many algorithms // requested, -- so perf-db and find-db are fully updated. std::vector perf_results(request_algo_count); #if MIOPEN_BACKEND_OPENCL cl_context ctx; clGetCommandQueueInfo(q, CL_QUEUE_CONTEXT, sizeof(cl_context), &ctx, nullptr); #elif MIOPEN_BACKEND_HIP uint32_t ctx = 0; #endif auto rc = FindForward(ret_algo_count, request_algo_count, perf_results, ctx); if(rc != miopenStatusSuccess) return rc; if(ret_algo_count == 0) throw std::runtime_error("Find Forward Conv. ret_algo_count == 0"); float alpha = static_cast(1), beta = static_cast(0); float kernel_total_time = 0.0; float kernel_first_time = 0.0; const auto algo = perf_results[0].fwd_algo; // use the fastest algo const auto ws_size = perf_results[0].memory; is_fwd_igemm = (algo == miopenConvolutionFwdAlgoImplicitGEMM); ResizeWorkspaceDev(ctx, ws_size); wall.start(wall_enabled); auto in_tens = (is_transform ? inputTensor_vect4 : inputTensor); auto in_buff = (is_transform ? in_vect4_dev->GetMem() : in_dev->GetMem()); auto wei_tens = (is_transform ? weightTensor_vect4 : weightTensor); auto wei_buff = (is_transform ? wei_vect4_dev->GetMem() : wei_dev->GetMem()); for(int i = 0; i < num_iterations; i++) { rc = miopenConvolutionForward(GetHandle(), &alpha, in_tens, in_buff, wei_tens, wei_buff, convDesc, algo, &beta, outputTensor, out_dev->GetMem(), workspace_dev != nullptr ? workspace_dev->GetMem() : nullptr, ws_size); if(rc != miopenStatusSuccess) return rc; if(time_enabled) { float time = 0.0; miopenGetKernelTime(GetHandle(), &time); kernel_total_time += time; if(i == 0) kernel_first_time = time; } } if(wall_enabled) { wall.stop(); fwd_auxiliary.stop(); fwd_auxiliary_gwss.stop(); std::cout << "Wall-clock Time Forward Conv. Elapsed: " << (wall.gettime_ms() / num_iterations) << " ms" << ", Auxiliary API calls: " << fwd_auxiliary.gettime_ms() << " ms" << " (GWSS: " << fwd_auxiliary_gwss.gettime_ms() << ')' << std::endl; } if(time_enabled) { miopenConvSolution_t solution; GetSolutionAfterFind( perf_results[0], Direction::Fwd, in_tens, wei_tens, outputTensor, solution); std::cout << "MIOpen Forward Conv. " << AlgorithmSolutionToString(solution) << std::endl; PrintForwardTime(kernel_total_time, kernel_first_time); } return rc; } template int ConvDriver::RunForwardGpuImmed(const bool is_transform) { std::size_t count; fwd_auxiliary.resume(wall_enabled); auto rc = miopenConvolutionForwardGetSolutionCount(handle, (is_transform ? weightTensor_vect4 : weightTensor), (is_transform ? inputTensor_vect4 : inputTensor), convDesc, outputTensor, &count); fwd_auxiliary.pause(wall_enabled); if(rc != miopenStatusSuccess) return rc; if(count < 1) return miopenStatusNotImplemented; auto solutions = std::vector(count); fwd_auxiliary.resume(wall_enabled); rc = miopenConvolutionForwardGetSolution(handle, (is_transform ? weightTensor_vect4 : weightTensor), (is_transform ? inputTensor_vect4 : inputTensor), convDesc, outputTensor, count, &count, solutions.data()); fwd_auxiliary.pause(wall_enabled); if(rc != miopenStatusSuccess) return rc; std::cout << "Forward Conv solutions available: " << count << std::endl; if(count < 1) return miopenStatusNotImplemented; solutions.resize(count); const miopenConvSolution_t* selected = nullptr; for(const auto& s : solutions) PrintImmedSolutionInfo(s); if(*immediate_solution == 0) selected = &solutions.front(); else for(const auto& s : solutions) if(*immediate_solution == s.solution_id) { selected = &s; break; } miopenConvSolution_t voluntary = { -1.0, 0, *immediate_solution, static_cast(-1)}; if(selected == nullptr) { std::cout << "Warning: Solution id (" << *immediate_solution << ") is not reported by the library. Trying it anyway..." << std::endl; selected = &voluntary; } std::size_t ws_size; fwd_auxiliary.resume(wall_enabled); fwd_auxiliary_gwss.resume(wall_enabled); rc = miopenConvolutionForwardGetSolutionWorkspaceSize( handle, (is_transform ? weightTensor_vect4 : weightTensor), (is_transform ? inputTensor_vect4 : inputTensor), convDesc, outputTensor, selected->solution_id, &ws_size); fwd_auxiliary_gwss.pause(wall_enabled); fwd_auxiliary.pause(wall_enabled); if(rc != miopenStatusSuccess) return rc; #if MIOPEN_BACKEND_OPENCL cl_context ctx; clGetCommandQueueInfo(q, CL_QUEUE_CONTEXT, sizeof(cl_context), &ctx, nullptr); #elif MIOPEN_BACKEND_HIP uint32_t ctx = 0; #endif auto ws = std::unique_ptr{ws_size > 0 ? new GPUMem{ctx, ws_size, 1} : nullptr}; fwd_auxiliary.resume(wall_enabled); rc = miopenConvolutionForwardCompileSolution(handle, (is_transform ? weightTensor_vect4 : weightTensor), (is_transform ? inputTensor_vect4 : inputTensor), convDesc, outputTensor, selected->solution_id); fwd_auxiliary.pause(wall_enabled); if(rc != miopenStatusSuccess) return rc; float kernel_total_time = 0.0; float kernel_first_time = 0.0; wall.start(wall_enabled); for(int i = 0; i < num_iterations; i++) { rc = miopenConvolutionForwardImmediate( handle, (is_transform ? weightTensor_vect4 : weightTensor), (is_transform ? wei_vect4_dev->GetMem() : wei_dev->GetMem()), (is_transform ? inputTensor_vect4 : inputTensor), (is_transform ? in_vect4_dev->GetMem() : in_dev->GetMem()), convDesc, outputTensor, out_dev->GetMem(), ws ? ws->GetMem() : nullptr, ws_size, selected->solution_id); if(rc != miopenStatusSuccess) return rc; if(time_enabled) { float time = 0.0; miopenGetKernelTime(GetHandle(), &time); kernel_total_time += time; if(i == 0) { kernel_first_time = time; if(wall_enabled && num_iterations > 1) wall.start(); // The 1st is warm-up. Disregard it in wall time. } } } if(wall_enabled) { wall.stop(); fwd_auxiliary.stop(); fwd_auxiliary_gwss.stop(); const auto wall_iterations = (num_iterations > 1 ? num_iterations - 1 : 1); std::cout << "Wall-clock Time Forward Conv. Elapsed: " << (wall.gettime_ms() / wall_iterations) << " ms" << ", Auxiliary API calls: " << fwd_auxiliary.gettime_ms() << " ms" << " (GWSS: " << fwd_auxiliary_gwss.gettime_ms() << ')' << std::endl; } if(time_enabled) { std::cout << "MIOpen Forward Conv. " << AlgorithmSolutionToString(*selected) << std::endl; PrintForwardTime(kernel_total_time, kernel_first_time); } is_fwd_igemm = (selected->algorithm == miopenConvolutionAlgoImplicitGEMM); return miopenStatusSuccess; } template int ConvDriver::RunForwardCPU() { if(miopen::deref(convDesc).mode == miopenTranspose) { cpu_convolution_backward_data(miopen::deref(convDesc).GetSpatialDimension(), outhost, wei, in, miopen::deref(convDesc).GetConvPads(), miopen::deref(convDesc).GetConvStrides(), miopen::deref(convDesc).GetConvDilations(), miopen::deref(convDesc).GetGroupCount()); if(inflags.GetValueInt("bias") != 0) { cpu_bias_forward(outhost, b); } } else { cpu_convolution_forward(miopen::deref(convDesc).GetSpatialDimension(), in, wei, outhost, miopen::deref(convDesc).GetConvPads(), miopen::deref(convDesc).GetConvStrides(), miopen::deref(convDesc).GetConvDilations(), miopen::deref(convDesc).GetGroupCount()); if(inflags.GetValueInt("bias") != 0) { outhost.par_for_each([&](auto out_n_id, auto out_k_id, auto... out_spatial_id_pack) { outhost(out_n_id, out_k_id, out_spatial_id_pack...) = double(outhost(out_n_id, out_k_id, out_spatial_id_pack...)) + double(b.data[out_k_id]); }); } } if(inflags.GetValueInt("dump_output")) { dumpBufferToFile("dump_fwd_out_cpu.bin", outhost.data.data(), outhost.data.size()); } TrySaveVerificationCache(Direction::Fwd, outhost.data); return 0; } template int ConvDriver::RunForwardGPUReference() { AutoConvDirectNaiveAlwaysEnable naive_conv_enable; if(inflags.GetValueInt("bias") != 0) { std::cout << "gpu reference convolution does not support bias yet" << std::endl; return -1; } auto ref_solution_id = miopen::deref(convDesc).mode == miopenTranspose ? miopen::solver::Id("ConvDirectNaiveConvBwd").Value() : miopen::solver::Id("ConvDirectNaiveConvFwd").Value(); auto rc = miopenConvolutionForwardImmediate(handle, weightTensor, wei_dev->GetMem(), inputTensor, in_dev->GetMem(), convDesc, outputTensor, out_dev->GetMem(), nullptr, 0, ref_solution_id); if(rc != miopenStatusSuccess) { std::cout << "reference kernel fail to run " << miopen::solver::Id(ref_solution_id).ToString() << std::endl; return rc; } if(miopen_type{} == miopen_type{}) out_dev->FromGPU(GetStream(), outhost.data.data()); else { auto out_tmp = tensor(miopen::deref(outputTensor).GetLengths(), miopen::deref(outputTensor).GetStrides()); out_dev->FromGPU(GetStream(), out_tmp.data.data()); for(int i = 0; i < out_tmp.data.size(); i++) { outhost.data[i] = static_cast(out_tmp.data[i]); } } if(inflags.GetValueInt("dump_output")) { dumpBufferToFile( "dump_fwd_out_gpu_ref.bin", outhost.data.data(), outhost.data.size()); } // TrySaveVerificationCache(Direction::Fwd, outhost.data); return 0; } template int ConvDriver::FindBackwardData(int& ret_algo_count, int request_algo_count, std::vector& perf_results, context_t ctx) { bwd_auxiliary.resume(wall_enabled); ResizeWorkspaceDev(ctx, ws_sizeof_find_bwd); const auto rc = miopenFindConvolutionBackwardDataAlgorithm( GetHandle(), outputTensor, dout_dev->GetMem(), weightTensor, wei_dev->GetMem(), convDesc, inputTensor, din_dev->GetMem(), request_algo_count, &ret_algo_count, perf_results.data(), workspace_dev != nullptr ? workspace_dev->GetMem() : nullptr, ws_sizeof_find_bwd, (inflags.GetValueInt("search") == 1) ? true : false); bwd_auxiliary.pause(wall_enabled); return rc; } template int ConvDriver::FindBackwardWeights(int& ret_algo_count, int request_algo_count, std::vector& perf_results, context_t ctx) { wrw_auxiliary.resume(wall_enabled); ResizeWorkspaceDev(ctx, ws_sizeof_find_wrw); const auto rc = miopenFindConvolutionBackwardWeightsAlgorithm( GetHandle(), outputTensor, dout_dev->GetMem(), inputTensor, in_dev->GetMem(), convDesc, weightTensor, dwei_dev->GetMem(), request_algo_count, &ret_algo_count, perf_results.data(), workspace_dev != nullptr ? workspace_dev->GetMem() : nullptr, ws_sizeof_find_wrw, (inflags.GetValueInt("search") == 1) ? true : false); wrw_auxiliary.pause(wall_enabled); return rc; } template int ConvDriver::RunBackwardGPU() { if(data_type == miopenInt8 || data_type == miopenInt8x4) { std::cout << "Int8 Backward Convolution is not supported" << std::endl; return 0; } if(!(is_bwd || is_wrw)) return 0; int ret = 0; if(is_bwd) { auto rc = immediate_solution ? RunBackwardDataGpuImmed() : RunBackwardDataGpuFind(); is_bwd_run_failed = (rc != 0); ret |= rc; } if(is_wrw) { auto rc = immediate_solution ? RunBackwardWrwGpuImmed() : RunBackwardWrwGpuFind(); is_wrw_run_failed = (rc != 0); ret |= (rc << 16); // Differentiate WrW and Bwd error codes. } if(inflags.GetValueInt("dump_output")) { if(is_bwd) dumpBufferToFile("dump_bwd_din_gpu.bin", din.data(), din.size()); if(is_wrw) dumpBufferToFile("dump_bwd_dwei_gpu.bin", dwei.data(), dwei.size()); } if(inflags.GetValueInt("bias") != 0) { float alpha = static_cast(1), beta = static_cast(0); ret |= miopenConvolutionBackwardBias(GetHandle(), &alpha, outputTensor, dout_dev->GetMem(), &beta, biasTensor, db_dev->GetMem()); if(time_enabled) { float time = 0.0; miopenGetKernelTime(GetHandle(), &time); printf("GPU Kernel Time Backward Bias Conv. Elapsed: %f ms\n", time); } db_dev->FromGPU(GetStream(), db.data()); if(inflags.GetValueInt("dump_output")) { dumpBufferToFile("dump_bwd_db_gpu.bin", db.data(), db.size()); } } return ret; } template int ConvDriver::RunBackwardDataGpuFind() { int ret_algo_count; int request_algo_count = 2; std::vector perf_results_data(request_algo_count); #if MIOPEN_BACKEND_OPENCL cl_context ctx; clGetCommandQueueInfo(q, CL_QUEUE_CONTEXT, sizeof(cl_context), &ctx, nullptr); #elif MIOPEN_BACKEND_HIP uint32_t ctx = 0; #endif auto rc = FindBackwardData(ret_algo_count, request_algo_count, perf_results_data, ctx); if(rc != miopenStatusSuccess) return rc; if(ret_algo_count == 0) throw std::runtime_error("Find Backward Data Conv. ret_algo_count == 0"); float kernel_total_time = 0.0; float kernel_first_time = 0.0; float alpha = static_cast(1), beta = static_cast(0); const auto algo = perf_results_data[0].bwd_data_algo; const auto ws_size = perf_results_data[0].memory; is_bwd_igemm = (algo == miopenConvolutionBwdDataAlgoImplicitGEMM); ResizeWorkspaceDev(ctx, ws_size); wall.start(wall_enabled); for(int i = 0; i < num_iterations; i++) { rc = miopenConvolutionBackwardData(GetHandle(), &alpha, outputTensor, dout_dev->GetMem(), weightTensor, wei_dev->GetMem(), convDesc, algo, &beta, inputTensor, din_dev->GetMem(), workspace_dev != nullptr ? workspace_dev->GetMem() : nullptr, ws_size); if(rc != miopenStatusSuccess) return rc; if(time_enabled) { float time = 0.0; miopenGetKernelTime(GetHandle(), &time); kernel_total_time += time; if(i == 0) kernel_first_time = time; } } if(wall_enabled) { wall.stop(); bwd_auxiliary.stop(); bwd_auxiliary_gwss.stop(); std::cout << "Wall-clock Time Backward Data Conv. Elapsed: " << (wall.gettime_ms() / num_iterations) << " ms" << ", Auxiliary API calls: " << bwd_auxiliary.gettime_ms() << " ms" << " (GWSS: " << bwd_auxiliary_gwss.gettime_ms() << ')' << std::endl; } if(time_enabled) { miopenConvSolution_t solution; GetSolutionAfterFind(perf_results_data[0], Direction::Bwd, inputTensor, weightTensor, outputTensor, solution); std::cout << "MIOpen Backward Data Conv. " << AlgorithmSolutionToString(solution) << std::endl; PrintBackwardDataTime(kernel_total_time, kernel_first_time); } din_dev->FromGPU(GetStream(), din.data()); return rc; } template void ConvDriver::PrintBackwardDataTime(float kernel_total_time, float kernel_first_time) { float kernel_average_time = num_iterations > 1 ? (kernel_total_time - kernel_first_time) / (num_iterations - 1) : kernel_first_time; printf("GPU Kernel Time Backward Data Conv. Elapsed: %f ms (average)\n", kernel_average_time); const auto num_dim = miopen::deref(inputTensor).GetSize() - 2; if(num_dim != 2 && num_dim != 3) { printf("stats: for conv%dd\n", num_dim); return; } int group_count = std::max(inflags.GetValueInt("group_count"), 1); if(num_dim == 2) { int in_n, in_c, in_h, in_w; std::tie(in_n, in_c, in_h, in_w) = miopen::tien<4>(miopen::deref(inputTensor).GetLengths()); int wei_c, wei_n, wei_h, wei_w; std::tie(wei_c, wei_n, wei_h, wei_w) = miopen::tien<4>(miopen::deref(weightTensor).GetLengths()); int out_n, out_c, out_h, out_w; std::tie(out_n, out_c, out_h, out_w) = miopen::tien<4>(miopen::deref(outputTensor).GetLengths()); size_t flopCnt = 2L * in_n * in_c * wei_h * wei_w * out_c * out_h * out_w / group_count; size_t weightBytes = wei_n * wei_c * wei_h * wei_w * miopen::GetTypeSize(miopen::deref(weightTensor).GetType()); size_t inputBytes = in_n * in_c * out_c * miopen::GetTypeSize(miopen::deref(inputTensor).GetType()); size_t readBytes = inputBytes + weightBytes; size_t outputBytes = 1.0 * out_n * out_c * out_h * out_w * miopen::GetTypeSize(miopen::deref(outputTensor).GetType()); printf("stats: name, n, c, ho, wo, x, y, k, flopCnt, bytesRead, bytesWritten, GFLOPs, " "GB/s, timeMs\n"); printf("stats: %s%dx%du%d, %u, %u, %u, %u, %u, %u, %u, %zu, %zu, %zu, %.0f, %.0f, %f\n", "bwdd-conv", wei_h, wei_w, miopen::deref(convDesc).GetConvStrides()[0], in_n, in_c, wei_h, wei_w, out_c, out_h, out_w, flopCnt, readBytes, outputBytes, flopCnt / kernel_average_time / 1e6, (readBytes + outputBytes) / kernel_average_time / 1e6, kernel_average_time); } else { // 3d int in_n, in_c, in_d, in_h, in_w; std::tie(in_n, in_c, in_d, in_h, in_w) = miopen::tien<5>(miopen::deref(inputTensor).GetLengths()); int wei_c, wei_n, wei_d, wei_h, wei_w; std::tie(wei_c, wei_n, wei_d, wei_h, wei_w) = miopen::tien<5>(miopen::deref(weightTensor).GetLengths()); int out_n, out_c, out_d, out_h, out_w; std::tie(out_n, out_c, out_d, out_h, out_w) = miopen::tien<5>(miopen::deref(outputTensor).GetLengths()); size_t flopCnt = 2L * in_n * in_c * wei_d * wei_h * wei_w * out_c * out_d * out_h * out_w / group_count; size_t weightBytes = wei_n * wei_c * wei_d * wei_h * wei_w * miopen::GetTypeSize(miopen::deref(weightTensor).GetType()); size_t inputBytes = in_n * in_c * out_c * miopen::GetTypeSize(miopen::deref(inputTensor).GetType()); size_t readBytes = inputBytes + weightBytes; size_t outputBytes = 1.0 * out_n * out_c * out_d * out_h * out_w * miopen::GetTypeSize(miopen::deref(outputTensor).GetType()); printf( "stats: name, n, c, do, ho, wo, z, x, y, k, flopCnt, bytesRead, bytesWritten, GFLOPs, " "GB/s, timeMs\n"); printf("stats: %s%dx%dx%du%d, %u, %u, %u, %u, %u, %u, %u, %u, %u %zu, %zu, %zu, %.0f, " "%.0f, %f\n", "bwdd-conv", wei_d, wei_h, wei_w, miopen::deref(convDesc).GetConvStrides()[0], in_n, in_c, wei_d, wei_h, wei_w, out_c, out_d, out_h, out_w, flopCnt, readBytes, outputBytes, flopCnt / kernel_average_time / 1e6, (readBytes + outputBytes) / kernel_average_time / 1e6, kernel_average_time); } } template int ConvDriver::RunBackwardWrwGpuFind() { int ret_algo_count; int request_algo_count = 2; float kernel_total_time = 0.0; float kernel_first_time = 0.0; float alpha = static_cast(1), beta = static_cast(0); std::vector perf_results_weights(request_algo_count); #if MIOPEN_BACKEND_OPENCL cl_context ctx; clGetCommandQueueInfo(q, CL_QUEUE_CONTEXT, sizeof(cl_context), &ctx, nullptr); #elif MIOPEN_BACKEND_HIP uint32_t ctx = 0; #endif auto rc = FindBackwardWeights(ret_algo_count, request_algo_count, perf_results_weights, ctx); if(rc != miopenStatusSuccess) return rc; if(ret_algo_count == 0) throw std::runtime_error("Find Backward Weights Conv. ret_algo_count == 0"); kernel_total_time = 0.0; kernel_first_time = 0.0; const auto algo = perf_results_weights[0].bwd_weights_algo; const auto ws_size = perf_results_weights[0].memory; is_wrw_winograd = (algo == miopenConvolutionBwdWeightsAlgoWinograd); is_wrw_igemm = (algo == miopenConvolutionBwdWeightsAlgoImplicitGEMM); ResizeWorkspaceDev(ctx, ws_size); wall.start(wall_enabled); for(int i = 0; i < num_iterations; i++) { rc = miopenConvolutionBackwardWeights(GetHandle(), &alpha, outputTensor, dout_dev->GetMem(), inputTensor, in_dev->GetMem(), convDesc, algo, &beta, weightTensor, dwei_dev->GetMem(), workspace_dev != nullptr ? workspace_dev->GetMem() : nullptr, ws_size); if(rc != miopenStatusSuccess) return rc; if(time_enabled) { float time = 0.0; miopenGetKernelTime(GetHandle(), &time); kernel_total_time += time; if(i == 0) kernel_first_time = time; } } if(wall_enabled) { wall.stop(); wrw_auxiliary.stop(); wrw_auxiliary_gwss.stop(); std::cout << "Wall-clock Time Backward Weights Conv. Elapsed: " << (wall.gettime_ms() / num_iterations) << " ms" << ", Auxiliary API calls: " << wrw_auxiliary.gettime_ms() << " ms" << " (GWSS: " << wrw_auxiliary_gwss.gettime_ms() << ')' << std::endl; } if(time_enabled) { miopenConvSolution_t solution; GetSolutionAfterFind(perf_results_weights[0], Direction::WrW, inputTensor, weightTensor, outputTensor, solution); std::cout << "MIOpen Backward Weights Conv. " << AlgorithmSolutionToString(solution) << std::endl; PrintBackwardWrwTime(kernel_total_time, kernel_first_time); } dwei_dev->FromGPU(GetStream(), dwei.data()); return rc; } template void ConvDriver::PrintBackwardWrwTime(float kernel_total_time, float kernel_first_time) { float time = 0.0; miopenGetKernelTime(GetHandle(), &time); float kernel_average_time = num_iterations > 1 ? (kernel_total_time - kernel_first_time) / (num_iterations - 1) : kernel_first_time; printf("GPU Kernel Time Backward Weights Conv. Elapsed: %f ms (average)\n", kernel_average_time); const auto num_dim = miopen::deref(inputTensor).GetSize() - 2; if(num_dim != 2 && num_dim != 3) { printf("stats: for conv%dd\n", num_dim); return; } int group_count = std::max(inflags.GetValueInt("group_count"), 1); if(num_dim == 2) { int in_n, in_c, in_h, in_w; std::tie(in_n, in_c, in_h, in_w) = miopen::tien<4>(miopen::deref(inputTensor).GetLengths()); int wei_c, wei_n, wei_h, wei_w; std::tie(wei_c, wei_n, wei_h, wei_w) = miopen::tien<4>(miopen::deref(weightTensor).GetLengths()); int out_n, out_c, out_h, out_w; std::tie(out_n, out_c, out_h, out_w) = miopen::tien<4>(miopen::deref(outputTensor).GetLengths()); size_t flopCnt = 2L * in_n * in_c * wei_h * wei_w * out_c * out_h * out_w / group_count; size_t readBytes = 0; size_t outputBytes = 0; printf("stats: name, n, c, ho, wo, x, y, k, flopCnt, bytesRead, bytesWritten, GFLOPs, " "GB/s, timeMs\n"); printf("stats: %s%dx%du%d, %u, %u, %u, %u, %u, %u, %u, %zu, %zu, %zu, %.0f, %.0f, %f\n", "bwdw-conv", wei_h, wei_w, miopen::deref(convDesc).GetConvStrides()[0], in_n, in_c, out_h, out_w, wei_h, wei_w, out_c, flopCnt, readBytes, outputBytes, flopCnt / kernel_average_time / 1e6, (readBytes + outputBytes) / kernel_average_time / 1e6, kernel_average_time); } else { // 3d int in_n, in_c, in_d, in_h, in_w; std::tie(in_n, in_c, in_d, in_h, in_w) = miopen::tien<5>(miopen::deref(inputTensor).GetLengths()); int wei_c, wei_n, wei_d, wei_h, wei_w; std::tie(wei_c, wei_n, wei_d, wei_h, wei_w) = miopen::tien<5>(miopen::deref(weightTensor).GetLengths()); int out_n, out_c, out_d, out_h, out_w; std::tie(out_n, out_c, out_d, out_h, out_w) = miopen::tien<5>(miopen::deref(outputTensor).GetLengths()); size_t flopCnt = 2L * in_n * in_c * wei_d * wei_h * wei_w * out_c * out_d * out_h * out_w / group_count; size_t readBytes = 0; size_t outputBytes = 0; printf( "stats: name, n, c, do, ho, wo, z, x, y, k, flopCnt, bytesRead, bytesWritten, GFLOPs, " "GB/s, timeMs\n"); printf("stats: %s%dx%dx%du%d, %u, %u, %u, %u, %u, %u, %u, %u, %u, %zu, %zu, %zu, %.0f, " "%.0f, %f\n ", "bwdw-conv", wei_d, wei_h, wei_w, miopen::deref(convDesc).GetConvStrides()[0], in_n, in_c, out_d, out_h, out_w, wei_d, wei_h, wei_w, out_c, flopCnt, readBytes, outputBytes, flopCnt / kernel_average_time / 1e6, (readBytes + outputBytes) / kernel_average_time / 1e6, kernel_average_time); } } template int ConvDriver::RunBackwardDataGpuImmed() { std::size_t count; bwd_auxiliary.resume(wall_enabled); auto rc = miopenConvolutionBackwardDataGetSolutionCount( handle, outputTensor, weightTensor, convDesc, inputTensor, &count); bwd_auxiliary.pause(wall_enabled); if(rc != miopenStatusSuccess) return rc; if(count < 1) return miopenStatusNotImplemented; auto solutions = std::vector(count); bwd_auxiliary.resume(wall_enabled); rc = miopenConvolutionBackwardDataGetSolution( handle, outputTensor, weightTensor, convDesc, inputTensor, count, &count, solutions.data()); bwd_auxiliary.pause(wall_enabled); if(rc != miopenStatusSuccess) return rc; std::cout << "Backward Data Conv solutions available: " << count << std::endl; if(count < 1) return miopenStatusNotImplemented; solutions.resize(count); const miopenConvSolution_t* selected = nullptr; for(const auto& s : solutions) PrintImmedSolutionInfo(s); if(*immediate_solution == 0) selected = &solutions.front(); else for(const auto& s : solutions) if(*immediate_solution == s.solution_id) { selected = &s; break; } miopenConvSolution_t voluntary = { -1.0, 0, *immediate_solution, static_cast(-1)}; if(selected == nullptr) { std::cout << "Warning: Solution id (" << *immediate_solution << ") is not reported by the library. Trying it anyway..." << std::endl; selected = &voluntary; } std::size_t ws_size; bwd_auxiliary.resume(wall_enabled); bwd_auxiliary_gwss.resume(wall_enabled); rc = miopenConvolutionBackwardDataGetSolutionWorkspaceSize( handle, outputTensor, weightTensor, convDesc, inputTensor, selected->solution_id, &ws_size); bwd_auxiliary_gwss.pause(wall_enabled); bwd_auxiliary.pause(wall_enabled); if(rc != miopenStatusSuccess) return rc; #if MIOPEN_BACKEND_OPENCL cl_context ctx; clGetCommandQueueInfo(q, CL_QUEUE_CONTEXT, sizeof(cl_context), &ctx, nullptr); #elif MIOPEN_BACKEND_HIP uint32_t ctx = 0; #endif auto ws = std::unique_ptr{ws_size > 0 ? new GPUMem{ctx, ws_size, 1} : nullptr}; bwd_auxiliary.resume(wall_enabled); rc = miopenConvolutionBackwardDataCompileSolution( handle, outputTensor, weightTensor, convDesc, inputTensor, selected->solution_id); bwd_auxiliary.pause(wall_enabled); float kernel_total_time = 0.0; float kernel_first_time = 0.0; wall.start(wall_enabled); for(int i = 0; i < num_iterations; i++) { rc = miopenConvolutionBackwardDataImmediate(handle, outputTensor, dout_dev->GetMem(), weightTensor, wei_dev->GetMem(), convDesc, inputTensor, din_dev->GetMem(), ws ? ws->GetMem() : nullptr, ws_size, selected->solution_id); if(rc != miopenStatusSuccess) return rc; if(time_enabled) { float time = 0.0; miopenGetKernelTime(GetHandle(), &time); kernel_total_time += time; if(i == 0) { kernel_first_time = time; if(wall_enabled && num_iterations > 1) wall.start(); } } } if(wall_enabled) { wall.stop(); bwd_auxiliary.stop(); bwd_auxiliary_gwss.stop(); const auto wall_iterations = (num_iterations > 1 ? num_iterations - 1 : 1); std::cout << "Wall-clock Time Backward Data Conv. Elapsed: " << (wall.gettime_ms() / wall_iterations) << " ms" << ", Auxiliary API calls: " << bwd_auxiliary.gettime_ms() << " ms" << " (GWSS: " << bwd_auxiliary_gwss.gettime_ms() << ')' << std::endl; } if(time_enabled) { std::cout << "MIOpen Backward Data Conv. " << AlgorithmSolutionToString(*selected) << std::endl; PrintBackwardDataTime(kernel_total_time, kernel_first_time); } is_bwd_igemm = (selected->algorithm == miopenConvolutionAlgoImplicitGEMM); din_dev->FromGPU(GetStream(), din.data()); return rc; } template int ConvDriver::RunBackwardWrwGpuImmed() { std::size_t count; wrw_auxiliary.resume(wall_enabled); auto rc = miopenConvolutionBackwardWeightsGetSolutionCount( handle, outputTensor, inputTensor, convDesc, weightTensor, &count); wrw_auxiliary.pause(wall_enabled); if(rc != miopenStatusSuccess) return rc; if(count < 1) return miopenStatusNotImplemented; auto solutions = std::vector(count); wrw_auxiliary.resume(wall_enabled); rc = miopenConvolutionBackwardWeightsGetSolution( handle, outputTensor, inputTensor, convDesc, weightTensor, count, &count, solutions.data()); wrw_auxiliary.pause(wall_enabled); if(rc != miopenStatusSuccess) return rc; std::cout << "Backward Weights Conv solutions available: " << count << std::endl; if(count < 1) return miopenStatusNotImplemented; solutions.resize(count); const miopenConvSolution_t* selected = nullptr; for(const auto& s : solutions) PrintImmedSolutionInfo(s); if(*immediate_solution == 0) selected = &solutions.front(); else for(const auto& s : solutions) if(*immediate_solution == s.solution_id) { selected = &s; break; } miopenConvSolution_t voluntary = { -1.0, 0, *immediate_solution, static_cast(-1)}; if(selected == nullptr) { std::cout << "Warning: Solution id (" << *immediate_solution << ") is not reported by the library. Trying it anyway..." << std::endl; selected = &voluntary; } std::size_t ws_size; wrw_auxiliary.resume(wall_enabled); wrw_auxiliary_gwss.resume(wall_enabled); rc = miopenConvolutionBackwardWeightsGetSolutionWorkspaceSize( handle, outputTensor, inputTensor, convDesc, weightTensor, selected->solution_id, &ws_size); wrw_auxiliary_gwss.pause(wall_enabled); wrw_auxiliary.pause(wall_enabled); if(rc != miopenStatusSuccess) return rc; #if MIOPEN_BACKEND_OPENCL cl_context ctx; clGetCommandQueueInfo(q, CL_QUEUE_CONTEXT, sizeof(cl_context), &ctx, nullptr); #elif MIOPEN_BACKEND_HIP uint32_t ctx = 0; #endif auto ws = std::unique_ptr{ws_size > 0 ? new GPUMem{ctx, ws_size, 1} : nullptr}; wrw_auxiliary.resume(wall_enabled); rc = miopenConvolutionBackwardWeightsCompileSolution( handle, outputTensor, inputTensor, convDesc, weightTensor, selected->solution_id); wrw_auxiliary.pause(wall_enabled); float kernel_total_time = 0.0; float kernel_first_time = 0.0; wall.start(wall_enabled); for(int i = 0; i < num_iterations; i++) { rc = miopenConvolutionBackwardWeightsImmediate(handle, outputTensor, dout_dev->GetMem(), inputTensor, in_dev->GetMem(), convDesc, weightTensor, dwei_dev->GetMem(), ws ? ws->GetMem() : nullptr, ws_size, selected->solution_id); if(rc != miopenStatusSuccess) return rc; if(time_enabled) { float time = 0.0; miopenGetKernelTime(GetHandle(), &time); kernel_total_time += time; if(i == 0) { kernel_first_time = time; if(wall_enabled && num_iterations > 1) wall.start(); } } } if(wall_enabled) { wall.stop(); wrw_auxiliary.stop(); wrw_auxiliary_gwss.stop(); const auto wall_iterations = (num_iterations > 1 ? num_iterations - 1 : 1); std::cout << "Wall-clock Time Backward Weights Conv. Elapsed: " << (wall.gettime_ms() / wall_iterations) << " ms" << ", Auxiliary API calls: " << wrw_auxiliary.gettime_ms() << " ms" << " (GWSS: " << wrw_auxiliary_gwss.gettime_ms() << ')' << std::endl; } if(time_enabled) { std::cout << "MIOpen Backward Weights Conv. " << AlgorithmSolutionToString(*selected) << std::endl; PrintBackwardWrwTime(kernel_total_time, kernel_first_time); } is_wrw_winograd = (selected->algorithm == miopenConvolutionAlgoWinograd); is_wrw_igemm = (selected->algorithm == miopenConvolutionAlgoImplicitGEMM); dwei_dev->FromGPU(GetStream(), dwei.data()); return rc; } template int ConvDriver::RunBackwardWeightsCPU() { if(miopen::deref(convDesc).mode == miopenTranspose) { cpu_convolution_backward_weight(miopen::deref(convDesc).GetSpatialDimension(), dout, dwei_host, in, miopen::deref(convDesc).GetConvPads(), miopen::deref(convDesc).GetConvStrides(), miopen::deref(convDesc).GetConvDilations(), miopen::deref(convDesc).GetGroupCount()); } else { cpu_convolution_backward_weight(miopen::deref(convDesc).GetSpatialDimension(), in, dwei_host, dout, miopen::deref(convDesc).GetConvPads(), miopen::deref(convDesc).GetConvStrides(), miopen::deref(convDesc).GetConvDilations(), miopen::deref(convDesc).GetGroupCount()); } if(inflags.GetValueInt("dump_output")) { dumpBufferToFile( "dump_bwd_dwei_cpu.bin", dwei_host.data.data(), dwei_host.data.size()); } TrySaveVerificationCache(Direction::WrW, dwei_host.data); return 0; } template int ConvDriver::RunBackwardDataCPU() { if(miopen::deref(convDesc).mode == miopenTranspose) { cpu_convolution_forward(miopen::deref(convDesc).GetSpatialDimension(), dout, wei, din_host, miopen::deref(convDesc).GetConvPads(), miopen::deref(convDesc).GetConvStrides(), miopen::deref(convDesc).GetConvDilations(), miopen::deref(convDesc).GetGroupCount()); } else { cpu_convolution_backward_data(miopen::deref(convDesc).GetSpatialDimension(), din_host, wei, dout, miopen::deref(convDesc).GetConvPads(), miopen::deref(convDesc).GetConvStrides(), miopen::deref(convDesc).GetConvDilations(), miopen::deref(convDesc).GetGroupCount()); } if(inflags.GetValueInt("dump_output")) { dumpBufferToFile("dump_bwd_din_cpu.bin", din_host.data.data(), din_host.data.size()); } TrySaveVerificationCache(Direction::Bwd, din_host.data); return 0; } template int ConvDriver::RunBackwardBiasCPU() { cpu_bias_backward_data(dout, db_host); if(inflags.GetValueInt("dump_output")) { dumpBufferToFile("dump_bwd_db_cpu.bin", db_host.data.data(), db_host.data.size()); } TrySaveVerificationCache(Direction::BwdBias, db_host.data); return 0; } template int ConvDriver::RunBackwardWeightsGPUReference() { AutoConvDirectNaiveAlwaysEnable naive_conv_enable; auto ref_solution_id = miopen::solver::Id("ConvDirectNaiveConvWrw").Value(); auto rc = miopenConvolutionBackwardWeightsImmediate(handle, outputTensor, dout_dev->GetMem(), inputTensor, in_dev->GetMem(), convDesc, weightTensor, dwei_dev->GetMem(), nullptr, 0, ref_solution_id); if(rc != miopenStatusSuccess) { std::cout << "reference kernel fail to run " << miopen::solver::Id(ref_solution_id).ToString() << std::endl; return rc; } if(miopen_type{} == miopen_type{}) dwei_dev->FromGPU(GetStream(), dwei_host.data.data()); else { auto dwei_tmp = tensor(miopen::deref(weightTensor).GetLengths(), miopen::deref(weightTensor).GetStrides()); dwei_dev->FromGPU(GetStream(), dwei_tmp.data.data()); for(int i = 0; i < dwei_tmp.data.size(); i++) { dwei_host.data[i] = static_cast(dwei_tmp.data[i]); } } if(inflags.GetValueInt("dump_output")) { dumpBufferToFile( "dump_bwd_dwei_gpu_ref.bin", dwei_host.data.data(), dwei_host.data.size()); } // TrySaveVerificationCache(Direction::WrW, dwei_host.data); return 0; } template int ConvDriver::RunBackwardDataGPUReference() { AutoConvDirectNaiveAlwaysEnable naive_conv_enable; auto ref_solution_id = miopen::deref(convDesc).mode == miopenTranspose ? miopen::solver::Id("ConvDirectNaiveConvFwd").Value() : miopen::solver::Id("ConvDirectNaiveConvBwd").Value(); auto rc = miopenConvolutionBackwardDataImmediate(handle, outputTensor, dout_dev->GetMem(), weightTensor, wei_dev->GetMem(), convDesc, inputTensor, din_dev->GetMem(), nullptr, 0, ref_solution_id); if(rc != miopenStatusSuccess) { std::cout << "reference kernel fail to run " << miopen::solver::Id(ref_solution_id).ToString() << std::endl; return rc; } if(miopen_type{} == miopen_type{}) din_dev->FromGPU(GetStream(), din_host.data.data()); else { auto din_tmp = tensor(miopen::deref(inputTensor).GetLengths(), miopen::deref(inputTensor).GetStrides()); din_dev->FromGPU(GetStream(), din_tmp.data.data()); for(int i = 0; i < din_tmp.data.size(); i++) { din_host.data[i] = static_cast(din_tmp.data[i]); } } if(inflags.GetValueInt("dump_output")) { dumpBufferToFile( "dump_bwd_din_gpu_ref.bin", din_host.data.data(), din_host.data.size()); } // TrySaveVerificationCache(Direction::Bwd, din_host.data); return 0; } template std::string ConvDriver::GetVerificationCacheFileName( const ConvDriver::Direction& direction) const { std::ostringstream ss; miopenConvolutionMode_t mode; int spatial_dim = inflags.GetValueInt("spatial_dim"); std::vector pads(spatial_dim); std::vector conv_strides(spatial_dim); std::vector conv_dilations(spatial_dim); std::vector trans_output_pads(spatial_dim); miopenGetConvolutionNdDescriptor(convDesc, spatial_dim, &spatial_dim, pads.data(), conv_strides.data(), conv_dilations.data(), &mode); auto get_basename_string = [&]() { switch(direction) { case Direction::Fwd: return "conv_fwd_out"; case Direction::Bwd: return "conv_bwd_dat"; case Direction::WrW: return "conv_bwd_wei"; case Direction::BwdBias: return "bias_bwd_dat"; } return ""; // For gcc. }; auto get_datatype_string = [](auto type) { if(std::is_same::value) { return "int8"; } else if(std::is_same::value) { return "float16"; } else if(std::is_same::value) { return "float"; } else if(std::is_same::value) { return "double"; } else if(std::is_same::value) { return "bfloat16"; } else { MIOPEN_THROW("unknown data type"); } }; ss << get_basename_string(); ss << "_" << mode; ss << "_" << spatial_dim; ss << "_" << miopen::deref(convDesc).paddingMode; ss << "_" << miopen::deref(convDesc).GetGroupCount(); miopen::LogRange(ss << "_", miopen::deref(inputTensor).GetLengths(), "x"); miopen::LogRange(ss << "_", miopen::deref(weightTensor).GetLengths(), "x"); miopen::LogRange(ss << "_", pads, "x"); miopen::LogRange(ss << "_", conv_strides, "x"); miopen::LogRange(ss << "_", conv_dilations, "x"); miopen::LogRange(ss << "_", trans_output_pads, "x"); ss << "_" << inflags.GetValueInt("pad_val"); ss << "_" << inflags.GetValueInt("bias"); ss << "_" << "GPU" << get_datatype_string(Tgpu{}); ss << "_" << "REF" << get_datatype_string(Tref{}); return ss.str(); } template bool ConvDriver::TryReadVerificationCache( const ConvDriver::Direction& direction, miopenTensorDescriptor_t& tensorDesc, Tref* data) const { const auto verification_cache_path = inflags.GetValueStr("verification_cache"); if(!verification_cache_path.empty()) { const auto file_path = verification_cache_path + "/" + GetVerificationCacheFileName(direction); if(std::ifstream(file_path).good()) { if(readBufferFromFile(data, GetTensorSize(tensorDesc), file_path.c_str())) { return true; } } } return false; } template void ConvDriver::TrySaveVerificationCache( const ConvDriver::Direction& direction, std::vector& data) const { const auto verification_cache_path = inflags.GetValueStr("verification_cache"); if(!verification_cache_path.empty()) { const auto file_path = verification_cache_path + "/" + GetVerificationCacheFileName(direction); dumpBufferToFile(file_path.c_str(), data.data(), data.size()); } } template int ConvDriver::VerifyForward() { if(!is_fwd) return 0; if(!is_fwd_run_failed) if(!TryReadVerificationCache(Direction::Fwd, outputTensor, outhost.data.data())) { if(UseGPUReference()) RunForwardGPUReference(); else RunForwardCPU(); } const auto isInt8 = (data_type == miopenInt8 || data_type == miopenInt8x4); auto error = is_fwd_run_failed ? std::numeric_limits::max() : (isInt8 ? miopen::rms_range(outhost.data, out_int8) : miopen::rms_range(outhost.data, out.data)); auto tolerance = GetDefaultTolerance(); // iGemm's deviation is higher than other algorithms. // The reason is most likely different order of computations. if(is_fwd_igemm) tolerance = tolerance * 10; if(!std::isfinite(error) || error > tolerance) { std::cout << "Forward Convolution FAILED: " << error << " > " << tolerance << std::endl; return EC_VerifyFwd; } std::cout << "Forward Convolution Verifies OK on " << (UseGPUReference() ? "GPU" : "CPU") << " reference (" << error << ')' << std::endl; return 0; } template int ConvDriver::VerifyBackward() { if(data_type == miopenInt8 || data_type == miopenInt8x4) { std::cout << "Int8 Backward Convolution is not supported" << std::endl; return 0; } if(!(is_bwd || is_wrw)) return 0; int cumulative_rc = 0; if(is_bwd) { if(!is_bwd_run_failed) if(!TryReadVerificationCache(Direction::Bwd, inputTensor, din_host.data.data())) { if(UseGPUReference()) RunBackwardDataGPUReference(); else RunBackwardDataCPU(); } auto error_data = is_bwd_run_failed ? std::numeric_limits::max() : miopen::rms_range(din_host.data, din); auto tolerance = GetDefaultTolerance(); // iGemm's deviation is higher than other algorithms. // The reason is most likely different order of computations. if(is_bwd_igemm) tolerance = tolerance * 10; if(!std::isfinite(error_data) || error_data > tolerance) { std::cout << "Backward Convolution Data FAILED: " << error_data << " > " << tolerance << std::endl; cumulative_rc |= EC_VerifyBwd; } else { std::cout << "Backward Convolution Data Verifies OK on " << (UseGPUReference() ? "GPU" : "CPU") << " reference (" << error_data << ')' << std::endl; } } if(is_wrw) { if(!is_wrw_run_failed) if(!TryReadVerificationCache(Direction::WrW, weightTensor, dwei_host.data.data())) { if(UseGPUReference()) RunBackwardWeightsGPUReference(); else RunBackwardWeightsCPU(); } // WrW deviation is ~twice worse than Bwd due to more FP computations involved, // which means more roundings, so GPU amd CPU computations diverge more. auto tolerance = 2 * GetDefaultTolerance(); // Winograd and iGemm WrW algorithms reveal bigger deviation than other algos. if(is_wrw_winograd && std::is_same::value) { tolerance *= 10; } else if(is_wrw_igemm) { if(std::is_same::value) #if WORKAROUND_ISSUE_2176 tolerance = 0.01; #else tolerance *= 10; #endif else if(std::is_same::value) tolerance *= 5; } auto error_weights = is_wrw_run_failed ? std::numeric_limits::max() : miopen::rms_range(dwei_host.data, dwei); if(!std::isfinite(error_weights) || error_weights > tolerance) { std::cout << "Backward Convolution Weights FAILED: " << error_weights << " > " << tolerance << std::endl; cumulative_rc |= EC_VerifyWrw; } else { std::cout << "Backward Convolution Weights Verifies OK on " << (UseGPUReference() ? "GPU" : "CPU") << " reference (" << error_weights << ')' << std::endl; } } if(inflags.GetValueInt("bias") != 0) { if(!TryReadVerificationCache(Direction::BwdBias, biasTensor, db_host.data.data())) { RunBackwardBiasCPU(); } auto error_bias = miopen::rms_range(db_host.data, db); const auto tolerance = GetDefaultTolerance(); if(!std::isfinite(error_bias) || error_bias > tolerance) { std::cout << "Backward Convolution Bias FAILED: " << error_bias << " > " << tolerance << std::endl; cumulative_rc |= EC_VerifyBwdBias; } else { std::cout << "Backward Convolution Bias Verifies OK on " << (UseGPUReference() ? "GPU" : "CPU") << " reference (" << error_bias << ')' << std::endl; } } return cumulative_rc; } #endif // GUARD_MIOPEN_CONV_DRIVER_HPP