/*******************************************************************************
 *
 * MIT License
 *
 * Copyright (c) 2021 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 *******************************************************************************/

#include <cstddef>
#include <miopen/solver.hpp>
#include <miopen/handle.hpp>
#include <miopen/generic_search.hpp>
#include <miopen/conv/wrw_invoke_params.hpp>
#include <miopen/solver/implicitgemm_util.hpp>
#include <miopen/gcn_asm_utils.hpp>
#include <miopen/tensor_ops.hpp>
#include <miopen/conv/asm_implicit_gemm.hpp>
#include <miopen/util_sol.hpp>

MIOPEN_DECLARE_ENV_VAR(MIOPEN_DEBUG_CONV_IMPLICIT_GEMM_ASM_WRW_GTC_XDLOPS_NHWC)
MIOPEN_DECLARE_ENV_VAR(MIOPEN_DEBUG_CONV_IMPLICIT_GEMM_ASM_PK_ATOMIC_ADD_FP16)

#define WRW_MAX_GEMM_K_SPLITS 10

namespace miopen {
namespace solver {

static inline std::size_t GetTypeSize(const std::string& s)
{
    if(s == "fp32")
        return miopen::GetTypeSize(miopenFloat);
    if(s == "fp16")
        return miopen::GetTypeSize(miopenHalf);
    else
        return miopen::GetTypeSize(miopenBFloat16);
}

static const inline std::vector<PerformanceConfigAsmImplicitGemmGTCWrwXdlopsNHWC>&
GetWrwXdlopsNHWCConfigList()
{
    // clang-format off
    static const  std::vector<PerformanceConfigAsmImplicitGemmGTCWrwXdlopsNHWC> kernel_param_list {
        {"wrw", "nhwc", miopenFloat,  0, 0, 256, 128,  16, 32, 32,  2, 2, 1, 2, 2, 0, 0, 0, 0, 0, { 1, 1, 1,16}, {  1, 16,  1, 16}, { 1, 1, 1, 8}, {  1, 16,  1, 16}},
        {"wrw", "nhwc", miopenFloat,  0, 0, 256, 128,  16, 32, 32,  2, 2, 1, 2, 2, 0, 0, 1, 0, 0, { 1, 1, 1,16}, {  1, 16,  1, 16}, { 1, 1, 1, 8}, {  1, 16,  1, 16}},
        {"wrw", "nhwc", miopenFloat,  0, 1, 256, 128,  16, 32, 32,  2, 2, 1, 2, 2, 0, 0, 0, 0, 0, { 1, 1, 1,16}, {  1, 16,  1, 16}, { 1, 1, 1, 8}, {  1, 16,  1, 16}},
        {"wrw", "nhwc", miopenFloat,  0, 1, 256, 128,  16, 32, 32,  2, 2, 1, 2, 2, 0, 0, 1, 0, 0, { 1, 1, 1,16}, {  1, 16,  1, 16}, { 1, 1, 1, 8}, {  1, 16,  1, 16}},
        {"wrw", "nhwc", miopenFloat,  0, 0, 128, 256,  16, 32, 32,  2, 1, 2, 2, 2, 0, 0, 0, 0, 0, { 1, 1, 1, 8}, {  1, 16,  1, 16}, { 1, 1, 1,16}, {  1, 16,  1, 16}},
        {"wrw", "nhwc", miopenFloat,  0, 0, 128, 256,  16, 32, 32,  2, 1, 2, 2, 2, 0, 0, 1, 0, 0, { 1, 1, 1, 8}, {  1, 16,  1, 16}, { 1, 1, 1,16}, {  1, 16,  1, 16}},
        {"wrw", "nhwc", miopenFloat,  0, 1, 128, 256,  16, 32, 32,  2, 1, 2, 2, 2, 0, 0, 0, 0, 0, { 1, 1, 1, 8}, {  1, 16,  1, 16}, { 1, 1, 1,16}, {  1, 16,  1, 16}},
        {"wrw", "nhwc", miopenFloat,  0, 1, 128, 256,  16, 32, 32,  2, 1, 2, 2, 2, 0, 0, 1, 0, 0, { 1, 1, 1, 8}, {  1, 16,  1, 16}, { 1, 1, 1,16}, {  1, 16,  1, 16}},
        {"wrw", "nhwc", miopenFloat,  0, 0, 128, 128,  16, 32, 32,  2, 1, 2, 1, 2, 0, 0, 0, 0, 0, { 1, 1, 1, 8}, {  1, 16,  1, 16}, { 1, 1, 1, 8}, {  1, 16,  1, 16}},
        {"wrw", "nhwc", miopenFloat,  0, 0, 128, 128,  16, 32, 32,  2, 1, 2, 1, 2, 0, 0, 1, 0, 0, { 1, 1, 1, 8}, {  1, 16,  1, 16}, { 1, 1, 1, 8}, {  1, 16,  1, 16}},
        {"wrw", "nhwc", miopenFloat,  0, 1, 128, 128,  16, 32, 32,  2, 1, 2, 1, 2, 0, 0, 0, 0, 0, { 1, 1, 1, 8}, {  1, 16,  1, 16}, { 1, 1, 1, 8}, {  1, 16,  1, 16}},
        {"wrw", "nhwc", miopenFloat,  0, 1, 128, 128,  16, 32, 32,  2, 1, 2, 1, 2, 0, 0, 1, 0, 0, { 1, 1, 1, 8}, {  1, 16,  1, 16}, { 1, 1, 1, 8}, {  1, 16,  1, 16}},
        {"wrw", "nhwc", miopenFloat,  0, 0, 256,  64,  16, 32, 32,  2, 1, 1, 2, 2, 0, 0, 0, 0, 0, { 1, 1, 1,16}, {  1, 16,  1, 16}, { 1, 1, 1, 4}, {  1, 16,  1, 16}},
        {"wrw", "nhwc", miopenFloat,  0, 0, 256,  64,  16, 32, 32,  2, 1, 1, 2, 2, 0, 0, 1, 0, 0, { 1, 1, 1,16}, {  1, 16,  1, 16}, { 1, 1, 1, 4}, {  1, 16,  1, 16}},
        {"wrw", "nhwc", miopenFloat,  0, 1, 256,  64,  16, 32, 32,  2, 1, 1, 2, 2, 0, 0, 0, 0, 0, { 1, 1, 1,16}, {  1, 16,  1, 16}, { 1, 1, 1, 4}, {  1, 16,  1, 16}},
        {"wrw", "nhwc", miopenFloat,  0, 1, 256,  64,  16, 32, 32,  2, 1, 1, 2, 2, 0, 0, 1, 0, 0, { 1, 1, 1,16}, {  1, 16,  1, 16}, { 1, 1, 1, 4}, {  1, 16,  1, 16}},
        {"wrw", "nhwc", miopenFloat,  0, 0,  64, 256,  16, 32, 32,  2, 1, 1, 2, 2, 0, 0, 0, 0, 0, { 1, 1, 1, 4}, {  1, 16,  1, 16}, { 1, 1, 1,16}, {  1, 16,  1, 16}},
        {"wrw", "nhwc", miopenFloat,  0, 0,  64, 256,  16, 32, 32,  2, 1, 1, 2, 2, 0, 0, 1, 0, 0, { 1, 1, 1, 4}, {  1, 16,  1, 16}, { 1, 1, 1,16}, {  1, 16,  1, 16}},
        {"wrw", "nhwc", miopenFloat,  0, 1,  64, 256,  16, 32, 32,  2, 1, 1, 2, 2, 0, 0, 0, 0, 0, { 1, 1, 1, 4}, {  1, 16,  1, 16}, { 1, 1, 1,16}, {  1, 16,  1, 16}},
        {"wrw", "nhwc", miopenFloat,  0, 1,  64, 256,  16, 32, 32,  2, 1, 1, 2, 2, 0, 0, 1, 0, 0, { 1, 1, 1, 4}, {  1, 16,  1, 16}, { 1, 1, 1,16}, {  1, 16,  1, 16}},
        {"wrw", "nhwc", miopenFloat,  0, 0, 128,  64,  16, 32, 32,  2, 1, 1, 2, 1, 0, 0, 0, 0, 0, { 1, 1, 1, 8}, {  1, 16,  1, 16}, { 1, 1, 1, 4}, {  1, 16,  1, 16}},
        {"wrw", "nhwc", miopenFloat,  0, 0, 128,  64,  16, 32, 32,  2, 1, 1, 2, 1, 0, 0, 1, 0, 0, { 1, 1, 1, 8}, {  1, 16,  1, 16}, { 1, 1, 1, 4}, {  1, 16,  1, 16}},
        {"wrw", "nhwc", miopenFloat,  0, 1, 128,  64,  16, 32, 32,  2, 1, 1, 2, 1, 0, 0, 0, 0, 0, { 1, 1, 1, 8}, {  1, 16,  1, 16}, { 1, 1, 1, 4}, {  1, 16,  1, 16}},
        {"wrw", "nhwc", miopenFloat,  0, 1, 128,  64,  16, 32, 32,  2, 1, 1, 2, 1, 0, 0, 1, 0, 0, { 1, 1, 1, 8}, {  1, 16,  1, 16}, { 1, 1, 1, 4}, {  1, 16,  1, 16}},
        {"wrw", "nhwc", miopenFloat,  0, 0,  64, 128,  16, 32, 32,  2, 1, 1, 1, 2, 0, 0, 0, 0, 0, { 1, 1, 1, 4}, {  1, 16,  1, 16}, { 1, 1, 1, 8}, {  1, 16,  1, 16}},
        {"wrw", "nhwc", miopenFloat,  0, 0,  64, 128,  16, 32, 32,  2, 1, 1, 1, 2, 0, 0, 1, 0, 0, { 1, 1, 1, 4}, {  1, 16,  1, 16}, { 1, 1, 1, 8}, {  1, 16,  1, 16}},
        {"wrw", "nhwc", miopenFloat,  0, 1,  64, 128,  16, 32, 32,  2, 1, 1, 1, 2, 0, 0, 0, 0, 0, { 1, 1, 1, 4}, {  1, 16,  1, 16}, { 1, 1, 1, 8}, {  1, 16,  1, 16}},
        {"wrw", "nhwc", miopenFloat,  0, 1,  64, 128,  16, 32, 32,  2, 1, 1, 1, 2, 0, 0, 1, 0, 0, { 1, 1, 1, 4}, {  1, 16,  1, 16}, { 1, 1, 1, 8}, {  1, 16,  1, 16}},
        {"wrw", "nhwc", miopenFloat,  0, 0, 256,  32,  16, 32, 32,  2, 1, 1, 2, 1, 0, 0, 0, 0, 0, { 1, 1, 1,16}, {  1, 16,  1, 16}, { 1, 1, 1, 2}, {  1, 16,  1, 16}},
        {"wrw", "nhwc", miopenFloat,  0, 0, 256,  32,  16, 32, 32,  2, 1, 1, 2, 1, 0, 0, 1, 0, 0, { 1, 1, 1,16}, {  1, 16,  1, 16}, { 1, 1, 1, 2}, {  1, 16,  1, 16}},
        {"wrw", "nhwc", miopenFloat,  0, 1, 256,  32,  16, 32, 32,  2, 1, 1, 2, 1, 0, 0, 0, 0, 0, { 1, 1, 1,16}, {  1, 16,  1, 16}, { 1, 1, 1, 2}, {  1, 16,  1, 16}},
        {"wrw", "nhwc", miopenFloat,  0, 1, 256,  32,  16, 32, 32,  2, 1, 1, 2, 1, 0, 0, 1, 0, 0, { 1, 1, 1,16}, {  1, 16,  1, 16}, { 1, 1, 1, 2}, {  1, 16,  1, 16}},
        {"wrw", "nhwc", miopenFloat,  0, 0,  32, 256,  16, 32, 32,  2, 1, 1, 1, 2, 0, 0, 0, 0, 0, { 1, 1, 1, 2}, {  1, 16,  1, 16}, { 1, 1, 1,16}, {  1, 16,  1, 16}},
        {"wrw", "nhwc", miopenFloat,  0, 0,  32, 256,  16, 32, 32,  2, 1, 1, 1, 2, 0, 0, 1, 0, 0, { 1, 1, 1, 2}, {  1, 16,  1, 16}, { 1, 1, 1,16}, {  1, 16,  1, 16}},
        {"wrw", "nhwc", miopenFloat,  0, 1,  32, 256,  16, 32, 32,  2, 1, 1, 1, 2, 0, 0, 0, 0, 0, { 1, 1, 1, 2}, {  1, 16,  1, 16}, { 1, 1, 1,16}, {  1, 16,  1, 16}},
        {"wrw", "nhwc", miopenFloat,  0, 1,  32, 256,  16, 32, 32,  2, 1, 1, 1, 2, 0, 0, 1, 0, 0, { 1, 1, 1, 2}, {  1, 16,  1, 16}, { 1, 1, 1,16}, {  1, 16,  1, 16}},
        {"wrw", "nhwc", miopenFloat,  0, 0,  64,  64,  16, 32, 32,  2, 1, 1, 1, 1, 0, 0, 0, 0, 0, { 1, 1, 1, 4}, {  1, 16,  1, 16}, { 1, 1, 1, 4}, {  1, 16,  1, 16}},
        {"wrw", "nhwc", miopenFloat,  0, 0,  64,  64,  16, 32, 32,  2, 1, 1, 1, 1, 0, 0, 1, 0, 0, { 1, 1, 1, 4}, {  1, 16,  1, 16}, { 1, 1, 1, 4}, {  1, 16,  1, 16}},
        {"wrw", "nhwc", miopenFloat,  0, 1,  64,  64,  16, 32, 32,  2, 1, 1, 1, 1, 0, 0, 0, 0, 0, { 1, 1, 1, 4}, {  1, 16,  1, 16}, { 1, 1, 1, 4}, {  1, 16,  1, 16}},
        {"wrw", "nhwc", miopenFloat,  0, 1,  64,  64,  16, 32, 32,  2, 1, 1, 1, 1, 0, 0, 1, 0, 0, { 1, 1, 1, 4}, {  1, 16,  1, 16}, { 1, 1, 1, 4}, {  1, 16,  1, 16}},
        {"wrw", "nhwc", miopenFloat,  0, 0, 128,  32,  16, 32, 32,  2, 1, 1, 1, 1, 0, 0, 0, 0, 0, { 1, 1, 1, 8}, {  1, 16,  1, 16}, { 1, 1, 1, 2}, {  1, 16,  1, 16}},
        {"wrw", "nhwc", miopenFloat,  0, 0, 128,  32,  16, 32, 32,  2, 1, 1, 1, 1, 0, 0, 1, 0, 0, { 1, 1, 1, 8}, {  1, 16,  1, 16}, { 1, 1, 1, 2}, {  1, 16,  1, 16}},
        {"wrw", "nhwc", miopenFloat,  0, 1, 128,  32,  16, 32, 32,  2, 1, 1, 1, 1, 0, 0, 0, 0, 0, { 1, 1, 1, 8}, {  1, 16,  1, 16}, { 1, 1, 1, 2}, {  1, 16,  1, 16}},
        {"wrw", "nhwc", miopenFloat,  0, 1, 128,  32,  16, 32, 32,  2, 1, 1, 1, 1, 0, 0, 1, 0, 0, { 1, 1, 1, 8}, {  1, 16,  1, 16}, { 1, 1, 1, 2}, {  1, 16,  1, 16}},
        {"wrw", "nhwc", miopenFloat,  0, 0,  32, 128,  16, 32, 32,  2, 1, 1, 1, 1, 0, 0, 0, 0, 0, { 1, 1, 1, 2}, {  1, 16,  1, 16}, { 1, 1, 1, 8}, {  1, 16,  1, 16}},
        {"wrw", "nhwc", miopenFloat,  0, 0,  32, 128,  16, 32, 32,  2, 1, 1, 1, 1, 0, 0, 1, 0, 0, { 1, 1, 1, 2}, {  1, 16,  1, 16}, { 1, 1, 1, 8}, {  1, 16,  1, 16}},
        {"wrw", "nhwc", miopenFloat,  0, 1,  32, 128,  16, 32, 32,  2, 1, 1, 1, 1, 0, 0, 0, 0, 0, { 1, 1, 1, 2}, {  1, 16,  1, 16}, { 1, 1, 1, 8}, {  1, 16,  1, 16}},
        {"wrw", "nhwc", miopenFloat,  0, 1,  32, 128,  16, 32, 32,  2, 1, 1, 1, 1, 0, 0, 1, 0, 0, { 1, 1, 1, 2}, {  1, 16,  1, 16}, { 1, 1, 1, 8}, {  1, 16,  1, 16}},
        {"wrw", "nhwc", miopenFloat,  0, 0,  64,  32,  16, 16, 16,  4, 1, 1, 2, 1, 0, 0, 0, 0, 0, { 1, 1, 1, 4}, {  1, 16,  1, 16}, { 1, 1, 1, 2}, {  1, 16,  1, 16}},
        {"wrw", "nhwc", miopenFloat,  0, 0,  64,  32,  16, 16, 16,  4, 1, 1, 2, 1, 0, 0, 1, 0, 0, { 1, 1, 1, 4}, {  1, 16,  1, 16}, { 1, 1, 1, 2}, {  1, 16,  1, 16}},
        {"wrw", "nhwc", miopenFloat,  0, 1,  64,  32,  16, 16, 16,  4, 1, 1, 2, 1, 0, 0, 0, 0, 0, { 1, 1, 1, 4}, {  1, 16,  1, 16}, { 1, 1, 1, 2}, {  1, 16,  1, 16}},
        {"wrw", "nhwc", miopenFloat,  0, 1,  64,  32,  16, 16, 16,  4, 1, 1, 2, 1, 0, 0, 1, 0, 0, { 1, 1, 1, 4}, {  1, 16,  1, 16}, { 1, 1, 1, 2}, {  1, 16,  1, 16}},
        {"wrw", "nhwc", miopenFloat,  0, 0,  32,  64,  16, 16, 16,  4, 1, 1, 1, 2, 0, 0, 0, 0, 0, { 1, 1, 1, 2}, {  1, 16,  1, 16}, { 1, 1, 1, 4}, {  1, 16,  1, 16}},
        {"wrw", "nhwc", miopenFloat,  0, 0,  32,  64,  16, 16, 16,  4, 1, 1, 1, 2, 0, 0, 1, 0, 0, { 1, 1, 1, 2}, {  1, 16,  1, 16}, { 1, 1, 1, 4}, {  1, 16,  1, 16}},
        {"wrw", "nhwc", miopenFloat,  0, 1,  32,  64,  16, 16, 16,  4, 1, 1, 1, 2, 0, 0, 0, 0, 0, { 1, 1, 1, 2}, {  1, 16,  1, 16}, { 1, 1, 1, 4}, {  1, 16,  1, 16}},
        {"wrw", "nhwc", miopenFloat,  0, 1,  32,  64,  16, 16, 16,  4, 1, 1, 1, 2, 0, 0, 1, 0, 0, { 1, 1, 1, 2}, {  1, 16,  1, 16}, { 1, 1, 1, 4}, {  1, 16,  1, 16}},
        {"wrw", "nhwc", miopenFloat,  0, 0,  32,  32,  32, 16, 16,  4, 1, 1, 1, 1, 0, 0, 0, 0, 0, { 1, 1, 1, 4}, {  1, 32,  1,  8}, { 1, 1, 1, 4}, {  1, 32,  1,  8}},
        {"wrw", "nhwc", miopenFloat,  0, 0,  32,  32,  32, 16, 16,  4, 1, 1, 1, 1, 0, 0, 1, 0, 0, { 1, 1, 1, 4}, {  1, 32,  1,  8}, { 1, 1, 1, 4}, {  1, 32,  1,  8}},
        {"wrw", "nhwc", miopenFloat,  0, 1,  32,  32,  32, 16, 16,  4, 1, 1, 1, 1, 0, 0, 0, 0, 0, { 1, 1, 1, 4}, {  1, 32,  1,  8}, { 1, 1, 1, 4}, {  1, 32,  1,  8}},
        {"wrw", "nhwc", miopenFloat,  0, 1,  32,  32,  32, 16, 16,  4, 1, 1, 1, 1, 0, 0, 1, 0, 0, { 1, 1, 1, 4}, {  1, 32,  1,  8}, { 1, 1, 1, 4}, {  1, 32,  1,  8}},

        {"wrw", "nhwc", miopenHalf,  0, 1, 256, 256,  32, 32, 32,  8, 2, 2, 2, 2, 0, 0, 0, 0, 0, { 1, 4, 1, 8}, {  1,  8,  1, 32}, { 1, 4, 1, 8}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenHalf,  0, 1, 256, 256,  32, 32, 32,  8, 2, 2, 2, 2, 0, 0, 1, 0, 0, { 1, 4, 1, 8}, {  1,  8,  1, 32}, { 1, 4, 1, 8}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenHalf,  0, 1, 256, 256,  32, 32, 32,  8, 2, 2, 2, 2, 0, 1, 1, 0, 0, { 1, 4, 1, 8}, {  1,  8,  1, 32}, { 1, 4, 1, 8}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenHalf,  0, 0, 256, 256,  32, 32, 32,  8, 2, 2, 2, 2, 0, 0, 0, 0, 0, { 1, 4, 1, 8}, {  1,  8,  1, 32}, { 1, 4, 1, 8}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenHalf,  0, 0, 256, 256,  32, 32, 32,  8, 2, 2, 2, 2, 0, 0, 1, 0, 0, { 1, 4, 1, 8}, {  1,  8,  1, 32}, { 1, 4, 1, 8}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenHalf,  0, 0, 256, 256,  32, 32, 32,  8, 2, 2, 2, 2, 0, 1, 1, 0, 0, { 1, 4, 1, 8}, {  1,  8,  1, 32}, { 1, 4, 1, 8}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenHalf,  0, 1, 256, 128,  32, 32, 32,  8, 2, 1, 2, 2, 0, 0, 0, 0, 0, { 1, 4, 1, 8}, {  1,  8,  1, 32}, { 1, 4, 1, 4}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenHalf,  0, 1, 256, 128,  32, 32, 32,  8, 2, 1, 2, 2, 0, 0, 1, 0, 0, { 1, 4, 1, 8}, {  1,  8,  1, 32}, { 1, 4, 1, 4}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenHalf,  0, 1, 256, 128,  16, 32, 32,  8, 2, 1, 2, 2, 0, 0, 1, 0, 0, { 1, 4, 1, 4}, {  1,  4,  1, 64}, { 1, 4, 1, 2}, {  1,  4,  1, 64}},
        {"wrw", "nhwc", miopenHalf,  0, 1, 256, 128,  32, 32, 32,  8, 2, 1, 2, 2, 0, 1, 1, 0, 0, { 1, 4, 1, 8}, {  1,  8,  1, 32}, { 1, 4, 1, 4}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenHalf,  0, 1, 256, 128,  16, 32, 32,  8, 2, 1, 2, 2, 0, 1, 1, 0, 0, { 1, 4, 1, 4}, {  1,  4,  1, 64}, { 1, 4, 1, 2}, {  1,  4,  1, 64}},
        {"wrw", "nhwc", miopenHalf,  0, 0, 256, 128,  32, 32, 32,  8, 2, 1, 2, 2, 0, 0, 0, 0, 0, { 1, 4, 1, 8}, {  1,  8,  1, 32}, { 1, 4, 1, 4}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenHalf,  0, 0, 256, 128,  32, 32, 32,  8, 2, 1, 2, 2, 0, 0, 1, 0, 0, { 1, 4, 1, 8}, {  1,  8,  1, 32}, { 1, 4, 1, 4}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenHalf,  0, 0, 256, 128,  16, 32, 32,  8, 2, 1, 2, 2, 0, 0, 1, 0, 0, { 1, 4, 1, 4}, {  1,  4,  1, 64}, { 1, 4, 1, 2}, {  1,  4,  1, 64}},
        {"wrw", "nhwc", miopenHalf,  0, 0, 256, 128,  32, 32, 32,  8, 2, 1, 2, 2, 0, 1, 1, 0, 0, { 1, 4, 1, 8}, {  1,  8,  1, 32}, { 1, 4, 1, 4}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenHalf,  0, 0, 256, 128,  16, 32, 32,  8, 2, 1, 2, 2, 0, 1, 1, 0, 0, { 1, 4, 1, 4}, {  1,  4,  1, 64}, { 1, 4, 1, 2}, {  1,  4,  1, 64}},
        {"wrw", "nhwc", miopenHalf,  0, 1, 128, 256,  32, 32, 32,  8, 1, 2, 2, 2, 0, 0, 0, 0, 0, { 1, 4, 1, 4}, {  1,  8,  1, 32}, { 1, 4, 1, 8}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenHalf,  0, 1, 128, 256,  32, 32, 32,  8, 1, 2, 2, 2, 0, 0, 1, 0, 0, { 1, 4, 1, 4}, {  1,  8,  1, 32}, { 1, 4, 1, 8}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenHalf,  0, 1, 128, 256,  16, 32, 32,  8, 1, 2, 2, 2, 0, 0, 1, 0, 0, { 1, 4, 1, 2}, {  1,  4,  1, 64}, { 1, 4, 1, 4}, {  1,  4,  1, 64}},
        {"wrw", "nhwc", miopenHalf,  0, 1, 128, 256,  32, 32, 32,  8, 1, 2, 2, 2, 0, 1, 1, 0, 0, { 1, 4, 1, 4}, {  1,  8,  1, 32}, { 1, 4, 1, 8}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenHalf,  0, 1, 128, 256,  16, 32, 32,  8, 1, 2, 2, 2, 0, 1, 1, 0, 0, { 1, 4, 1, 2}, {  1,  4,  1, 64}, { 1, 4, 1, 4}, {  1,  4,  1, 64}},
        {"wrw", "nhwc", miopenHalf,  0, 0, 128, 256,  32, 32, 32,  8, 1, 2, 2, 2, 0, 0, 0, 0, 0, { 1, 4, 1, 4}, {  1,  8,  1, 32}, { 1, 4, 1, 8}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenHalf,  0, 0, 128, 256,  32, 32, 32,  8, 1, 2, 2, 2, 0, 0, 1, 0, 0, { 1, 4, 1, 4}, {  1,  8,  1, 32}, { 1, 4, 1, 8}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenHalf,  0, 0, 128, 256,  16, 32, 32,  8, 1, 2, 2, 2, 0, 0, 1, 0, 0, { 1, 4, 1, 2}, {  1,  4,  1, 64}, { 1, 4, 1, 4}, {  1,  4,  1, 64}},
        {"wrw", "nhwc", miopenHalf,  0, 0, 128, 256,  32, 32, 32,  8, 1, 2, 2, 2, 0, 1, 1, 0, 0, { 1, 4, 1, 4}, {  1,  8,  1, 32}, { 1, 4, 1, 8}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenHalf,  0, 0, 128, 256,  16, 32, 32,  8, 1, 2, 2, 2, 0, 1, 1, 0, 0, { 1, 4, 1, 2}, {  1,  4,  1, 64}, { 1, 4, 1, 4}, {  1,  4,  1, 64}},
        {"wrw", "nhwc", miopenHalf,  0, 1, 128, 128,  32, 32, 32,  8, 1, 1, 2, 2, 0, 0, 0, 0, 0, { 1, 4, 1, 4}, {  1,  8,  1, 32}, { 1, 4, 1, 4}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenHalf,  0, 1, 128, 128,  32, 32, 32,  8, 1, 1, 2, 2, 0, 0, 1, 0, 0, { 1, 4, 1, 4}, {  1,  8,  1, 32}, { 1, 4, 1, 4}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenHalf,  0, 1, 128, 128,  16, 32, 32,  8, 1, 1, 2, 2, 0, 0, 1, 0, 0, { 1, 4, 1, 2}, {  1,  4,  1, 64}, { 1, 4, 1, 2}, {  1,  4,  1, 64}},
        {"wrw", "nhwc", miopenHalf,  0, 1, 128, 128,  32, 32, 32,  8, 1, 1, 2, 2, 0, 1, 1, 0, 0, { 1, 4, 1, 4}, {  1,  8,  1, 32}, { 1, 4, 1, 4}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenHalf,  0, 1, 128, 128,  16, 32, 32,  8, 1, 1, 2, 2, 0, 1, 1, 0, 0, { 1, 4, 1, 2}, {  1,  4,  1, 64}, { 1, 4, 1, 2}, {  1,  4,  1, 64}},
        {"wrw", "nhwc", miopenHalf,  0, 0, 128, 128,  32, 32, 32,  8, 1, 1, 2, 2, 0, 0, 0, 0, 0, { 1, 4, 1, 4}, {  1,  8,  1, 32}, { 1, 4, 1, 4}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenHalf,  0, 0, 128, 128,  32, 32, 32,  8, 1, 1, 2, 2, 0, 0, 1, 0, 0, { 1, 4, 1, 4}, {  1,  8,  1, 32}, { 1, 4, 1, 4}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenHalf,  0, 0, 128, 128,  16, 32, 32,  8, 1, 1, 2, 2, 0, 0, 1, 0, 0, { 1, 4, 1, 2}, {  1,  4,  1, 64}, { 1, 4, 1, 2}, {  1,  4,  1, 64}},
        {"wrw", "nhwc", miopenHalf,  0, 0, 128, 128,  32, 32, 32,  8, 1, 1, 2, 2, 0, 1, 1, 0, 0, { 1, 4, 1, 4}, {  1,  8,  1, 32}, { 1, 4, 1, 4}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenHalf,  0, 0, 128, 128,  16, 32, 32,  8, 1, 1, 2, 2, 0, 1, 1, 0, 0, { 1, 4, 1, 2}, {  1,  4,  1, 64}, { 1, 4, 1, 2}, {  1,  4,  1, 64}},
        {"wrw", "nhwc", miopenHalf,  0, 1, 256,  64,  32, 32, 32,  8, 1, 1, 2, 2, 0, 0, 0, 0, 0, { 1, 4, 1, 8}, {  1,  8,  1, 32}, { 1, 4, 1, 2}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenHalf,  0, 1, 256,  64,  32, 32, 32,  8, 1, 1, 2, 2, 0, 0, 1, 0, 0, { 1, 4, 1, 8}, {  1,  8,  1, 32}, { 1, 4, 1, 2}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenHalf,  0, 1, 256,  64,  32, 32, 32,  8, 1, 1, 2, 2, 0, 1, 1, 0, 0, { 1, 4, 1, 8}, {  1,  8,  1, 32}, { 1, 4, 1, 2}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenHalf,  0, 1, 256,  64,  16, 32, 32,  8, 1, 1, 2, 2, 0, 1, 1, 0, 0, { 1, 4, 1, 4}, {  1,  4,  1, 64}, { 1, 4, 1, 1}, {  1,  4,  1, 64}},
        {"wrw", "nhwc", miopenHalf,  0, 0, 256,  64,  32, 32, 32,  8, 1, 1, 2, 2, 0, 0, 0, 0, 0, { 1, 4, 1, 8}, {  1,  8,  1, 32}, { 1, 4, 1, 2}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenHalf,  0, 0, 256,  64,  32, 32, 32,  8, 1, 1, 2, 2, 0, 0, 1, 0, 0, { 1, 4, 1, 8}, {  1,  8,  1, 32}, { 1, 4, 1, 2}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenHalf,  0, 0, 256,  64,  32, 32, 32,  8, 1, 1, 2, 2, 0, 1, 1, 0, 0, { 1, 4, 1, 8}, {  1,  8,  1, 32}, { 1, 4, 1, 2}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenHalf,  0, 0, 256,  64,  16, 32, 32,  8, 1, 1, 2, 2, 0, 1, 1, 0, 0, { 1, 4, 1, 4}, {  1,  4,  1, 64}, { 1, 4, 1, 1}, {  1,  4,  1, 64}},
        {"wrw", "nhwc", miopenHalf,  0, 1,  64, 256,  32, 32, 32,  8, 1, 1, 2, 2, 0, 0, 0, 0, 0, { 1, 4, 1, 2}, {  1,  8,  1, 32}, { 1, 4, 1, 8}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenHalf,  0, 1,  64, 256,  32, 32, 32,  8, 1, 1, 2, 2, 0, 0, 1, 0, 0, { 1, 4, 1, 2}, {  1,  8,  1, 32}, { 1, 4, 1, 8}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenHalf,  0, 1,  64, 256,  16, 32, 32,  8, 1, 1, 2, 2, 0, 0, 1, 0, 0, { 1, 4, 1, 1}, {  1,  4,  1, 64}, { 1, 4, 1, 4}, {  1,  4,  1, 64}},
        {"wrw", "nhwc", miopenHalf,  0, 1,  64, 256,  32, 32, 32,  8, 1, 1, 2, 2, 0, 1, 1, 0, 0, { 1, 4, 1, 2}, {  1,  8,  1, 32}, { 1, 4, 1, 8}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenHalf,  0, 1,  64, 256,  16, 32, 32,  8, 1, 1, 2, 2, 0, 1, 1, 0, 0, { 1, 4, 1, 1}, {  1,  4,  1, 64}, { 1, 4, 1, 4}, {  1,  4,  1, 64}},
        {"wrw", "nhwc", miopenHalf,  0, 0,  64, 256,  32, 32, 32,  8, 1, 1, 2, 2, 0, 0, 0, 0, 0, { 1, 4, 1, 2}, {  1,  8,  1, 32}, { 1, 4, 1, 8}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenHalf,  0, 0,  64, 256,  32, 32, 32,  8, 1, 1, 2, 2, 0, 0, 1, 0, 0, { 1, 4, 1, 2}, {  1,  8,  1, 32}, { 1, 4, 1, 8}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenHalf,  0, 0,  64, 256,  16, 32, 32,  8, 1, 1, 2, 2, 0, 0, 1, 0, 0, { 1, 4, 1, 1}, {  1,  4,  1, 64}, { 1, 4, 1, 4}, {  1,  4,  1, 64}},
        {"wrw", "nhwc", miopenHalf,  0, 0,  64, 256,  32, 32, 32,  8, 1, 1, 2, 2, 0, 1, 1, 0, 0, { 1, 4, 1, 2}, {  1,  8,  1, 32}, { 1, 4, 1, 8}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenHalf,  0, 0,  64, 256,  16, 32, 32,  8, 1, 1, 2, 2, 0, 1, 1, 0, 0, { 1, 4, 1, 1}, {  1,  4,  1, 64}, { 1, 4, 1, 4}, {  1,  4,  1, 64}},
        {"wrw", "nhwc", miopenHalf,  0, 1, 128,  64,  32, 32, 32,  8, 1, 1, 1, 2, 0, 0, 0, 0, 0, { 1, 4, 1, 4}, {  1,  8,  1, 32}, { 1, 4, 1, 2}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenHalf,  0, 1, 128,  64,  32, 32, 32,  8, 1, 1, 1, 2, 0, 0, 1, 0, 0, { 1, 4, 1, 4}, {  1,  8,  1, 32}, { 1, 4, 1, 2}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenHalf,  0, 1, 128,  64,  32, 32, 32,  8, 1, 1, 1, 2, 0, 1, 1, 0, 0, { 1, 4, 1, 4}, {  1,  8,  1, 32}, { 1, 4, 1, 2}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenHalf,  0, 0, 128,  64,  32, 32, 32,  8, 1, 1, 1, 2, 0, 0, 0, 0, 0, { 1, 4, 1, 4}, {  1,  8,  1, 32}, { 1, 4, 1, 2}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenHalf,  0, 0, 128,  64,  32, 32, 32,  8, 1, 1, 1, 2, 0, 0, 1, 0, 0, { 1, 4, 1, 4}, {  1,  8,  1, 32}, { 1, 4, 1, 2}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenHalf,  0, 0, 128,  64,  32, 32, 32,  8, 1, 1, 1, 2, 0, 1, 1, 0, 0, { 1, 4, 1, 4}, {  1,  8,  1, 32}, { 1, 4, 1, 2}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenHalf,  0, 1,  64, 128,  32, 32, 32,  8, 1, 1, 2, 1, 0, 0, 0, 0, 0, { 1, 4, 1, 2}, {  1,  8,  1, 32}, { 1, 4, 1, 4}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenHalf,  0, 1,  64, 128,  32, 32, 32,  8, 1, 1, 2, 1, 0, 0, 1, 0, 0, { 1, 4, 1, 2}, {  1,  8,  1, 32}, { 1, 4, 1, 4}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenHalf,  0, 1,  64, 128,  16, 32, 32,  8, 1, 1, 2, 1, 0, 0, 1, 0, 0, { 1, 4, 1, 1}, {  1,  4,  1, 64}, { 1, 4, 1, 2}, {  1,  4,  1, 64}},
        {"wrw", "nhwc", miopenHalf,  0, 1,  64, 128,  32, 32, 32,  8, 1, 1, 2, 1, 0, 1, 1, 0, 0, { 1, 4, 1, 2}, {  1,  8,  1, 32}, { 1, 4, 1, 4}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenHalf,  0, 1,  64, 128,  16, 32, 32,  8, 1, 1, 2, 1, 0, 1, 1, 0, 0, { 1, 4, 1, 1}, {  1,  4,  1, 64}, { 1, 4, 1, 2}, {  1,  4,  1, 64}},
        {"wrw", "nhwc", miopenHalf,  0, 0,  64, 128,  32, 32, 32,  8, 1, 1, 2, 1, 0, 0, 0, 0, 0, { 1, 4, 1, 2}, {  1,  8,  1, 32}, { 1, 4, 1, 4}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenHalf,  0, 0,  64, 128,  32, 32, 32,  8, 1, 1, 2, 1, 0, 0, 1, 0, 0, { 1, 4, 1, 2}, {  1,  8,  1, 32}, { 1, 4, 1, 4}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenHalf,  0, 0,  64, 128,  16, 32, 32,  8, 1, 1, 2, 1, 0, 0, 1, 0, 0, { 1, 4, 1, 1}, {  1,  4,  1, 64}, { 1, 4, 1, 2}, {  1,  4,  1, 64}},
        {"wrw", "nhwc", miopenHalf,  0, 0,  64, 128,  32, 32, 32,  8, 1, 1, 2, 1, 0, 1, 1, 0, 0, { 1, 4, 1, 2}, {  1,  8,  1, 32}, { 1, 4, 1, 4}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenHalf,  0, 0,  64, 128,  16, 32, 32,  8, 1, 1, 2, 1, 0, 1, 1, 0, 0, { 1, 4, 1, 1}, {  1,  4,  1, 64}, { 1, 4, 1, 2}, {  1,  4,  1, 64}},
        {"wrw", "nhwc", miopenHalf,  0, 1, 256,  32,  32, 64, 16,  4, 1, 1, 2, 1, 0, 0, 0, 0, 0, { 1, 4, 1, 8}, {  1,  8,  1, 32}, { 1, 4, 1, 1}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenHalf,  0, 1, 256,  32,  32, 64, 16,  4, 1, 1, 2, 1, 0, 1, 1, 0, 0, { 1, 4, 1, 8}, {  1,  8,  1, 32}, { 1, 4, 1, 1}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenHalf,  0, 0, 256,  32,  32, 64, 16,  4, 1, 1, 2, 1, 0, 0, 0, 0, 0, { 1, 4, 1, 8}, {  1,  8,  1, 32}, { 1, 4, 1, 1}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenHalf,  0, 0, 256,  32,  32, 64, 16,  4, 1, 1, 2, 1, 0, 1, 1, 0, 0, { 1, 4, 1, 8}, {  1,  8,  1, 32}, { 1, 4, 1, 1}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenHalf,  0, 1,  32, 256,  32, 16, 64,  4, 1, 1, 1, 2, 0, 0, 0, 0, 0, { 1, 4, 1, 1}, {  1,  8,  1, 32}, { 1, 4, 1, 8}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenHalf,  0, 1,  32, 256,  32, 16, 64,  4, 1, 1, 1, 2, 0, 0, 1, 0, 0, { 1, 4, 1, 1}, {  1,  8,  1, 32}, { 1, 4, 1, 8}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenHalf,  0, 1,  32, 256,  32, 16, 64,  4, 1, 1, 1, 2, 0, 1, 1, 0, 0, { 1, 4, 1, 1}, {  1,  8,  1, 32}, { 1, 4, 1, 8}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenHalf,  0, 0,  32, 256,  32, 16, 64,  4, 1, 1, 1, 2, 0, 0, 0, 0, 0, { 1, 4, 1, 1}, {  1,  8,  1, 32}, { 1, 4, 1, 8}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenHalf,  0, 0,  32, 256,  32, 16, 64,  4, 1, 1, 1, 2, 0, 0, 1, 0, 0, { 1, 4, 1, 1}, {  1,  8,  1, 32}, { 1, 4, 1, 8}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenHalf,  0, 0,  32, 256,  32, 16, 64,  4, 1, 1, 1, 2, 0, 1, 1, 0, 0, { 1, 4, 1, 1}, {  1,  8,  1, 32}, { 1, 4, 1, 8}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenHalf,  0, 1,  64,  64,  32, 32, 32,  8, 1, 1, 1, 1, 0, 1, 1, 0, 0, { 1, 8, 1, 1}, {  1,  4,  1, 64}, { 1, 8, 1, 1}, {  1,  4,  1, 64}},
        {"wrw", "nhwc", miopenHalf,  0, 1,  64,  64,  16, 32, 32,  8, 1, 1, 1, 1, 0, 1, 1, 0, 0, { 1, 4, 1, 1}, {  1,  4,  1, 64}, { 1, 4, 1, 1}, {  1,  4,  1, 64}},
        {"wrw", "nhwc", miopenHalf,  0, 1,  64,  64,  64, 32, 32,  8, 1, 1, 1, 1, 0, 0, 0, 0, 0, { 1, 4, 1, 4}, {  1, 16,  1, 16}, { 1, 4, 1, 4}, {  1, 16,  1, 16}},
        {"wrw", "nhwc", miopenHalf,  0, 1,  64,  64,  64, 32, 32,  8, 1, 1, 1, 1, 0, 0, 1, 0, 0, { 1, 4, 1, 4}, {  1, 16,  1, 16}, { 1, 4, 1, 4}, {  1, 16,  1, 16}},
        {"wrw", "nhwc", miopenHalf,  0, 1,  64,  64,  64, 32, 32,  8, 1, 1, 1, 1, 0, 1, 1, 0, 0, { 1, 4, 1, 4}, {  1, 16,  1, 16}, { 1, 4, 1, 4}, {  1, 16,  1, 16}},
        {"wrw", "nhwc", miopenHalf,  0, 0,  64,  64,  64, 32, 32,  8, 1, 1, 1, 1, 0, 0, 0, 0, 0, { 1, 4, 1, 4}, {  1, 16,  1, 16}, { 1, 4, 1, 4}, {  1, 16,  1, 16}},
        {"wrw", "nhwc", miopenHalf,  0, 0,  64,  64,  64, 32, 32,  8, 1, 1, 1, 1, 0, 0, 1, 0, 0, { 1, 4, 1, 4}, {  1, 16,  1, 16}, { 1, 4, 1, 4}, {  1, 16,  1, 16}},
        {"wrw", "nhwc", miopenHalf,  0, 0,  64,  64,  64, 32, 32,  8, 1, 1, 1, 1, 0, 1, 1, 0, 0, { 1, 4, 1, 4}, {  1, 16,  1, 16}, { 1, 4, 1, 4}, {  1, 16,  1, 16}},
        {"wrw", "nhwc", miopenHalf,  0, 1,  64,  32,  32, 16, 16, 16, 1, 1, 2, 1, 0, 0, 0, 0, 0, { 1, 4, 1, 2}, {  1,  8,  1, 32}, { 1, 4, 1, 1}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenHalf,  0, 1,  64,  32,  32, 16, 16, 16, 1, 1, 2, 1, 0, 1, 1, 0, 0, { 1, 4, 1, 2}, {  1,  8,  1, 32}, { 1, 4, 1, 1}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenHalf,  0, 0,  64,  32,  32, 16, 16, 16, 1, 1, 2, 1, 0, 0, 0, 0, 0, { 1, 4, 1, 2}, {  1,  8,  1, 32}, { 1, 4, 1, 1}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenHalf,  0, 0,  64,  32,  32, 16, 16, 16, 1, 1, 2, 1, 0, 1, 1, 0, 0, { 1, 4, 1, 2}, {  1,  8,  1, 32}, { 1, 4, 1, 1}, {  1,  8,  1, 32}},

        {"wrw", "nhwc", miopenBFloat16,  0, 1, 256, 256,  32, 32, 32,  8, 2, 2, 2, 2, 0, 0, 0, 0, 0, { 1, 4, 1, 8}, {  1,  8,  1, 32}, { 1, 4, 1, 8}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenBFloat16,  0, 1, 256, 256,  32, 32, 32,  8, 2, 2, 2, 2, 0, 0, 1, 0, 0, { 1, 4, 1, 8}, {  1,  8,  1, 32}, { 1, 4, 1, 8}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenBFloat16,  0, 1, 256, 256,  32, 32, 32,  8, 2, 2, 2, 2, 0, 1, 1, 0, 0, { 1, 4, 1, 8}, {  1,  8,  1, 32}, { 1, 4, 1, 8}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenBFloat16,  0, 0, 256, 256,  32, 32, 32,  8, 2, 2, 2, 2, 0, 0, 0, 0, 0, { 1, 4, 1, 8}, {  1,  8,  1, 32}, { 1, 4, 1, 8}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenBFloat16,  0, 0, 256, 256,  32, 32, 32,  8, 2, 2, 2, 2, 0, 0, 1, 0, 0, { 1, 4, 1, 8}, {  1,  8,  1, 32}, { 1, 4, 1, 8}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenBFloat16,  0, 0, 256, 256,  32, 32, 32,  8, 2, 2, 2, 2, 0, 1, 1, 0, 0, { 1, 4, 1, 8}, {  1,  8,  1, 32}, { 1, 4, 1, 8}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenBFloat16,  0, 1, 256, 128,  32, 32, 32,  8, 2, 1, 2, 2, 0, 0, 0, 0, 0, { 1, 4, 1, 8}, {  1,  8,  1, 32}, { 1, 4, 1, 4}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenBFloat16,  0, 1, 256, 128,  32, 32, 32,  8, 2, 1, 2, 2, 0, 0, 1, 0, 0, { 1, 4, 1, 8}, {  1,  8,  1, 32}, { 1, 4, 1, 4}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenBFloat16,  0, 1, 256, 128,  16, 32, 32,  8, 2, 1, 2, 2, 0, 0, 1, 0, 0, { 1, 4, 1, 4}, {  1,  4,  1, 64}, { 1, 4, 1, 2}, {  1,  4,  1, 64}},
        {"wrw", "nhwc", miopenBFloat16,  0, 1, 256, 128,  32, 32, 32,  8, 2, 1, 2, 2, 0, 1, 1, 0, 0, { 1, 4, 1, 8}, {  1,  8,  1, 32}, { 1, 4, 1, 4}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenBFloat16,  0, 1, 256, 128,  16, 32, 32,  8, 2, 1, 2, 2, 0, 1, 1, 0, 0, { 1, 4, 1, 4}, {  1,  4,  1, 64}, { 1, 4, 1, 2}, {  1,  4,  1, 64}},
        {"wrw", "nhwc", miopenBFloat16,  0, 0, 256, 128,  32, 32, 32,  8, 2, 1, 2, 2, 0, 0, 0, 0, 0, { 1, 4, 1, 8}, {  1,  8,  1, 32}, { 1, 4, 1, 4}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenBFloat16,  0, 0, 256, 128,  32, 32, 32,  8, 2, 1, 2, 2, 0, 0, 1, 0, 0, { 1, 4, 1, 8}, {  1,  8,  1, 32}, { 1, 4, 1, 4}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenBFloat16,  0, 0, 256, 128,  16, 32, 32,  8, 2, 1, 2, 2, 0, 0, 1, 0, 0, { 1, 4, 1, 4}, {  1,  4,  1, 64}, { 1, 4, 1, 2}, {  1,  4,  1, 64}},
        {"wrw", "nhwc", miopenBFloat16,  0, 0, 256, 128,  32, 32, 32,  8, 2, 1, 2, 2, 0, 1, 1, 0, 0, { 1, 4, 1, 8}, {  1,  8,  1, 32}, { 1, 4, 1, 4}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenBFloat16,  0, 0, 256, 128,  16, 32, 32,  8, 2, 1, 2, 2, 0, 1, 1, 0, 0, { 1, 4, 1, 4}, {  1,  4,  1, 64}, { 1, 4, 1, 2}, {  1,  4,  1, 64}},
        {"wrw", "nhwc", miopenBFloat16,  0, 1, 128, 256,  32, 32, 32,  8, 1, 2, 2, 2, 0, 0, 0, 0, 0, { 1, 4, 1, 4}, {  1,  8,  1, 32}, { 1, 4, 1, 8}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenBFloat16,  0, 1, 128, 256,  32, 32, 32,  8, 1, 2, 2, 2, 0, 0, 1, 0, 0, { 1, 4, 1, 4}, {  1,  8,  1, 32}, { 1, 4, 1, 8}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenBFloat16,  0, 1, 128, 256,  16, 32, 32,  8, 1, 2, 2, 2, 0, 0, 1, 0, 0, { 1, 4, 1, 2}, {  1,  4,  1, 64}, { 1, 4, 1, 4}, {  1,  4,  1, 64}},
        {"wrw", "nhwc", miopenBFloat16,  0, 1, 128, 256,  32, 32, 32,  8, 1, 2, 2, 2, 0, 1, 1, 0, 0, { 1, 4, 1, 4}, {  1,  8,  1, 32}, { 1, 4, 1, 8}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenBFloat16,  0, 1, 128, 256,  16, 32, 32,  8, 1, 2, 2, 2, 0, 1, 1, 0, 0, { 1, 4, 1, 2}, {  1,  4,  1, 64}, { 1, 4, 1, 4}, {  1,  4,  1, 64}},
        {"wrw", "nhwc", miopenBFloat16,  0, 0, 128, 256,  32, 32, 32,  8, 1, 2, 2, 2, 0, 0, 0, 0, 0, { 1, 4, 1, 4}, {  1,  8,  1, 32}, { 1, 4, 1, 8}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenBFloat16,  0, 0, 128, 256,  32, 32, 32,  8, 1, 2, 2, 2, 0, 0, 1, 0, 0, { 1, 4, 1, 4}, {  1,  8,  1, 32}, { 1, 4, 1, 8}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenBFloat16,  0, 0, 128, 256,  16, 32, 32,  8, 1, 2, 2, 2, 0, 0, 1, 0, 0, { 1, 4, 1, 2}, {  1,  4,  1, 64}, { 1, 4, 1, 4}, {  1,  4,  1, 64}},
        {"wrw", "nhwc", miopenBFloat16,  0, 0, 128, 256,  32, 32, 32,  8, 1, 2, 2, 2, 0, 1, 1, 0, 0, { 1, 4, 1, 4}, {  1,  8,  1, 32}, { 1, 4, 1, 8}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenBFloat16,  0, 0, 128, 256,  16, 32, 32,  8, 1, 2, 2, 2, 0, 1, 1, 0, 0, { 1, 4, 1, 2}, {  1,  4,  1, 64}, { 1, 4, 1, 4}, {  1,  4,  1, 64}},
        {"wrw", "nhwc", miopenBFloat16,  0, 1, 128, 128,  32, 32, 32,  8, 1, 1, 2, 2, 0, 0, 0, 0, 0, { 1, 4, 1, 4}, {  1,  8,  1, 32}, { 1, 4, 1, 4}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenBFloat16,  0, 1, 128, 128,  32, 32, 32,  8, 1, 1, 2, 2, 0, 0, 1, 0, 0, { 1, 4, 1, 4}, {  1,  8,  1, 32}, { 1, 4, 1, 4}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenBFloat16,  0, 1, 128, 128,  16, 32, 32,  8, 1, 1, 2, 2, 0, 0, 1, 0, 0, { 1, 4, 1, 2}, {  1,  4,  1, 64}, { 1, 4, 1, 2}, {  1,  4,  1, 64}},
        {"wrw", "nhwc", miopenBFloat16,  0, 1, 128, 128,  32, 32, 32,  8, 1, 1, 2, 2, 0, 1, 1, 0, 0, { 1, 4, 1, 4}, {  1,  8,  1, 32}, { 1, 4, 1, 4}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenBFloat16,  0, 1, 128, 128,  16, 32, 32,  8, 1, 1, 2, 2, 0, 1, 1, 0, 0, { 1, 4, 1, 2}, {  1,  4,  1, 64}, { 1, 4, 1, 2}, {  1,  4,  1, 64}},
        {"wrw", "nhwc", miopenBFloat16,  0, 0, 128, 128,  32, 32, 32,  8, 1, 1, 2, 2, 0, 0, 0, 0, 0, { 1, 4, 1, 4}, {  1,  8,  1, 32}, { 1, 4, 1, 4}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenBFloat16,  0, 0, 128, 128,  32, 32, 32,  8, 1, 1, 2, 2, 0, 0, 1, 0, 0, { 1, 4, 1, 4}, {  1,  8,  1, 32}, { 1, 4, 1, 4}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenBFloat16,  0, 0, 128, 128,  16, 32, 32,  8, 1, 1, 2, 2, 0, 0, 1, 0, 0, { 1, 4, 1, 2}, {  1,  4,  1, 64}, { 1, 4, 1, 2}, {  1,  4,  1, 64}},
        {"wrw", "nhwc", miopenBFloat16,  0, 0, 128, 128,  32, 32, 32,  8, 1, 1, 2, 2, 0, 1, 1, 0, 0, { 1, 4, 1, 4}, {  1,  8,  1, 32}, { 1, 4, 1, 4}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenBFloat16,  0, 0, 128, 128,  16, 32, 32,  8, 1, 1, 2, 2, 0, 1, 1, 0, 0, { 1, 4, 1, 2}, {  1,  4,  1, 64}, { 1, 4, 1, 2}, {  1,  4,  1, 64}},
        {"wrw", "nhwc", miopenBFloat16,  0, 1, 256,  64,  32, 32, 32,  8, 1, 1, 2, 2, 0, 0, 0, 0, 0, { 1, 4, 1, 8}, {  1,  8,  1, 32}, { 1, 4, 1, 2}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenBFloat16,  0, 1, 256,  64,  32, 32, 32,  8, 1, 1, 2, 2, 0, 0, 1, 0, 0, { 1, 4, 1, 8}, {  1,  8,  1, 32}, { 1, 4, 1, 2}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenBFloat16,  0, 1, 256,  64,  32, 32, 32,  8, 1, 1, 2, 2, 0, 1, 1, 0, 0, { 1, 4, 1, 8}, {  1,  8,  1, 32}, { 1, 4, 1, 2}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenBFloat16,  0, 1, 256,  64,  16, 32, 32,  8, 1, 1, 2, 2, 0, 1, 1, 0, 0, { 1, 4, 1, 4}, {  1,  4,  1, 64}, { 1, 4, 1, 1}, {  1,  4,  1, 64}},
        {"wrw", "nhwc", miopenBFloat16,  0, 0, 256,  64,  32, 32, 32,  8, 1, 1, 2, 2, 0, 0, 0, 0, 0, { 1, 4, 1, 8}, {  1,  8,  1, 32}, { 1, 4, 1, 2}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenBFloat16,  0, 0, 256,  64,  32, 32, 32,  8, 1, 1, 2, 2, 0, 0, 1, 0, 0, { 1, 4, 1, 8}, {  1,  8,  1, 32}, { 1, 4, 1, 2}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenBFloat16,  0, 0, 256,  64,  32, 32, 32,  8, 1, 1, 2, 2, 0, 1, 1, 0, 0, { 1, 4, 1, 8}, {  1,  8,  1, 32}, { 1, 4, 1, 2}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenBFloat16,  0, 0, 256,  64,  16, 32, 32,  8, 1, 1, 2, 2, 0, 1, 1, 0, 0, { 1, 4, 1, 4}, {  1,  4,  1, 64}, { 1, 4, 1, 1}, {  1,  4,  1, 64}},
        {"wrw", "nhwc", miopenBFloat16,  0, 1,  64, 256,  32, 32, 32,  8, 1, 1, 2, 2, 0, 0, 0, 0, 0, { 1, 4, 1, 2}, {  1,  8,  1, 32}, { 1, 4, 1, 8}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenBFloat16,  0, 1,  64, 256,  32, 32, 32,  8, 1, 1, 2, 2, 0, 0, 1, 0, 0, { 1, 4, 1, 2}, {  1,  8,  1, 32}, { 1, 4, 1, 8}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenBFloat16,  0, 1,  64, 256,  16, 32, 32,  8, 1, 1, 2, 2, 0, 0, 1, 0, 0, { 1, 4, 1, 1}, {  1,  4,  1, 64}, { 1, 4, 1, 4}, {  1,  4,  1, 64}},
        {"wrw", "nhwc", miopenBFloat16,  0, 1,  64, 256,  32, 32, 32,  8, 1, 1, 2, 2, 0, 1, 1, 0, 0, { 1, 4, 1, 2}, {  1,  8,  1, 32}, { 1, 4, 1, 8}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenBFloat16,  0, 1,  64, 256,  16, 32, 32,  8, 1, 1, 2, 2, 0, 1, 1, 0, 0, { 1, 4, 1, 1}, {  1,  4,  1, 64}, { 1, 4, 1, 4}, {  1,  4,  1, 64}},
        {"wrw", "nhwc", miopenBFloat16,  0, 0,  64, 256,  32, 32, 32,  8, 1, 1, 2, 2, 0, 0, 0, 0, 0, { 1, 4, 1, 2}, {  1,  8,  1, 32}, { 1, 4, 1, 8}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenBFloat16,  0, 0,  64, 256,  32, 32, 32,  8, 1, 1, 2, 2, 0, 0, 1, 0, 0, { 1, 4, 1, 2}, {  1,  8,  1, 32}, { 1, 4, 1, 8}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenBFloat16,  0, 0,  64, 256,  16, 32, 32,  8, 1, 1, 2, 2, 0, 0, 1, 0, 0, { 1, 4, 1, 1}, {  1,  4,  1, 64}, { 1, 4, 1, 4}, {  1,  4,  1, 64}},
        {"wrw", "nhwc", miopenBFloat16,  0, 0,  64, 256,  32, 32, 32,  8, 1, 1, 2, 2, 0, 1, 1, 0, 0, { 1, 4, 1, 2}, {  1,  8,  1, 32}, { 1, 4, 1, 8}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenBFloat16,  0, 0,  64, 256,  16, 32, 32,  8, 1, 1, 2, 2, 0, 1, 1, 0, 0, { 1, 4, 1, 1}, {  1,  4,  1, 64}, { 1, 4, 1, 4}, {  1,  4,  1, 64}},
        {"wrw", "nhwc", miopenBFloat16,  0, 1, 128,  64,  32, 32, 32,  8, 1, 1, 1, 2, 0, 0, 0, 0, 0, { 1, 4, 1, 4}, {  1,  8,  1, 32}, { 1, 4, 1, 2}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenBFloat16,  0, 1, 128,  64,  32, 32, 32,  8, 1, 1, 1, 2, 0, 0, 1, 0, 0, { 1, 4, 1, 4}, {  1,  8,  1, 32}, { 1, 4, 1, 2}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenBFloat16,  0, 1, 128,  64,  32, 32, 32,  8, 1, 1, 1, 2, 0, 1, 1, 0, 0, { 1, 4, 1, 4}, {  1,  8,  1, 32}, { 1, 4, 1, 2}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenBFloat16,  0, 0, 128,  64,  32, 32, 32,  8, 1, 1, 1, 2, 0, 0, 0, 0, 0, { 1, 4, 1, 4}, {  1,  8,  1, 32}, { 1, 4, 1, 2}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenBFloat16,  0, 0, 128,  64,  32, 32, 32,  8, 1, 1, 1, 2, 0, 0, 1, 0, 0, { 1, 4, 1, 4}, {  1,  8,  1, 32}, { 1, 4, 1, 2}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenBFloat16,  0, 0, 128,  64,  32, 32, 32,  8, 1, 1, 1, 2, 0, 1, 1, 0, 0, { 1, 4, 1, 4}, {  1,  8,  1, 32}, { 1, 4, 1, 2}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenBFloat16,  0, 1,  64, 128,  32, 32, 32,  8, 1, 1, 2, 1, 0, 0, 0, 0, 0, { 1, 4, 1, 2}, {  1,  8,  1, 32}, { 1, 4, 1, 4}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenBFloat16,  0, 1,  64, 128,  32, 32, 32,  8, 1, 1, 2, 1, 0, 0, 1, 0, 0, { 1, 4, 1, 2}, {  1,  8,  1, 32}, { 1, 4, 1, 4}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenBFloat16,  0, 1,  64, 128,  16, 32, 32,  8, 1, 1, 2, 1, 0, 0, 1, 0, 0, { 1, 4, 1, 1}, {  1,  4,  1, 64}, { 1, 4, 1, 2}, {  1,  4,  1, 64}},
        {"wrw", "nhwc", miopenBFloat16,  0, 1,  64, 128,  32, 32, 32,  8, 1, 1, 2, 1, 0, 1, 1, 0, 0, { 1, 4, 1, 2}, {  1,  8,  1, 32}, { 1, 4, 1, 4}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenBFloat16,  0, 1,  64, 128,  16, 32, 32,  8, 1, 1, 2, 1, 0, 1, 1, 0, 0, { 1, 4, 1, 1}, {  1,  4,  1, 64}, { 1, 4, 1, 2}, {  1,  4,  1, 64}},
        {"wrw", "nhwc", miopenBFloat16,  0, 0,  64, 128,  32, 32, 32,  8, 1, 1, 2, 1, 0, 0, 0, 0, 0, { 1, 4, 1, 2}, {  1,  8,  1, 32}, { 1, 4, 1, 4}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenBFloat16,  0, 0,  64, 128,  32, 32, 32,  8, 1, 1, 2, 1, 0, 0, 1, 0, 0, { 1, 4, 1, 2}, {  1,  8,  1, 32}, { 1, 4, 1, 4}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenBFloat16,  0, 0,  64, 128,  16, 32, 32,  8, 1, 1, 2, 1, 0, 0, 1, 0, 0, { 1, 4, 1, 1}, {  1,  4,  1, 64}, { 1, 4, 1, 2}, {  1,  4,  1, 64}},
        {"wrw", "nhwc", miopenBFloat16,  0, 0,  64, 128,  32, 32, 32,  8, 1, 1, 2, 1, 0, 1, 1, 0, 0, { 1, 4, 1, 2}, {  1,  8,  1, 32}, { 1, 4, 1, 4}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenBFloat16,  0, 0,  64, 128,  16, 32, 32,  8, 1, 1, 2, 1, 0, 1, 1, 0, 0, { 1, 4, 1, 1}, {  1,  4,  1, 64}, { 1, 4, 1, 2}, {  1,  4,  1, 64}},
        {"wrw", "nhwc", miopenBFloat16,  0, 1, 256,  32,  32, 64, 16,  4, 1, 1, 2, 1, 0, 0, 0, 0, 0, { 1, 4, 1, 8}, {  1,  8,  1, 32}, { 1, 4, 1, 1}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenBFloat16,  0, 1, 256,  32,  32, 64, 16,  4, 1, 1, 2, 1, 0, 1, 1, 0, 0, { 1, 4, 1, 8}, {  1,  8,  1, 32}, { 1, 4, 1, 1}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenBFloat16,  0, 0, 256,  32,  32, 64, 16,  4, 1, 1, 2, 1, 0, 0, 0, 0, 0, { 1, 4, 1, 8}, {  1,  8,  1, 32}, { 1, 4, 1, 1}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenBFloat16,  0, 0, 256,  32,  32, 64, 16,  4, 1, 1, 2, 1, 0, 1, 1, 0, 0, { 1, 4, 1, 8}, {  1,  8,  1, 32}, { 1, 4, 1, 1}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenBFloat16,  0, 1,  32, 256,  32, 16, 64,  4, 1, 1, 1, 2, 0, 0, 0, 0, 0, { 1, 4, 1, 1}, {  1,  8,  1, 32}, { 1, 4, 1, 8}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenBFloat16,  0, 1,  32, 256,  32, 16, 64,  4, 1, 1, 1, 2, 0, 0, 1, 0, 0, { 1, 4, 1, 1}, {  1,  8,  1, 32}, { 1, 4, 1, 8}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenBFloat16,  0, 1,  32, 256,  32, 16, 64,  4, 1, 1, 1, 2, 0, 1, 1, 0, 0, { 1, 4, 1, 1}, {  1,  8,  1, 32}, { 1, 4, 1, 8}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenBFloat16,  0, 0,  32, 256,  32, 16, 64,  4, 1, 1, 1, 2, 0, 0, 0, 0, 0, { 1, 4, 1, 1}, {  1,  8,  1, 32}, { 1, 4, 1, 8}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenBFloat16,  0, 0,  32, 256,  32, 16, 64,  4, 1, 1, 1, 2, 0, 0, 1, 0, 0, { 1, 4, 1, 1}, {  1,  8,  1, 32}, { 1, 4, 1, 8}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenBFloat16,  0, 0,  32, 256,  32, 16, 64,  4, 1, 1, 1, 2, 0, 1, 1, 0, 0, { 1, 4, 1, 1}, {  1,  8,  1, 32}, { 1, 4, 1, 8}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenBFloat16,  0, 1,  64,  64,  32, 32, 32,  8, 1, 1, 1, 1, 0, 1, 1, 0, 0, { 1, 8, 1, 1}, {  1,  4,  1, 64}, { 1, 8, 1, 1}, {  1,  4,  1, 64}},
        {"wrw", "nhwc", miopenBFloat16,  0, 1,  64,  64,  16, 32, 32,  8, 1, 1, 1, 1, 0, 1, 1, 0, 0, { 1, 4, 1, 1}, {  1,  4,  1, 64}, { 1, 4, 1, 1}, {  1,  4,  1, 64}},
        {"wrw", "nhwc", miopenBFloat16,  0, 1,  64,  64,  64, 32, 32,  8, 1, 1, 1, 1, 0, 0, 0, 0, 0, { 1, 4, 1, 4}, {  1, 16,  1, 16}, { 1, 4, 1, 4}, {  1, 16,  1, 16}},
        {"wrw", "nhwc", miopenBFloat16,  0, 1,  64,  64,  64, 32, 32,  8, 1, 1, 1, 1, 0, 0, 1, 0, 0, { 1, 4, 1, 4}, {  1, 16,  1, 16}, { 1, 4, 1, 4}, {  1, 16,  1, 16}},
        {"wrw", "nhwc", miopenBFloat16,  0, 1,  64,  64,  64, 32, 32,  8, 1, 1, 1, 1, 0, 1, 1, 0, 0, { 1, 4, 1, 4}, {  1, 16,  1, 16}, { 1, 4, 1, 4}, {  1, 16,  1, 16}},
        {"wrw", "nhwc", miopenBFloat16,  0, 0,  64,  64,  64, 32, 32,  8, 1, 1, 1, 1, 0, 0, 0, 0, 0, { 1, 4, 1, 4}, {  1, 16,  1, 16}, { 1, 4, 1, 4}, {  1, 16,  1, 16}},
        {"wrw", "nhwc", miopenBFloat16,  0, 0,  64,  64,  64, 32, 32,  8, 1, 1, 1, 1, 0, 0, 1, 0, 0, { 1, 4, 1, 4}, {  1, 16,  1, 16}, { 1, 4, 1, 4}, {  1, 16,  1, 16}},
        {"wrw", "nhwc", miopenBFloat16,  0, 0,  64,  64,  64, 32, 32,  8, 1, 1, 1, 1, 0, 1, 1, 0, 0, { 1, 4, 1, 4}, {  1, 16,  1, 16}, { 1, 4, 1, 4}, {  1, 16,  1, 16}},
        {"wrw", "nhwc", miopenBFloat16,  0, 1,  64,  32,  32, 16, 16, 16, 1, 1, 2, 1, 0, 0, 0, 0, 0, { 1, 4, 1, 2}, {  1,  8,  1, 32}, { 1, 4, 1, 1}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenBFloat16,  0, 1,  64,  32,  32, 16, 16, 16, 1, 1, 2, 1, 0, 1, 1, 0, 0, { 1, 4, 1, 2}, {  1,  8,  1, 32}, { 1, 4, 1, 1}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenBFloat16,  0, 0,  64,  32,  32, 16, 16, 16, 1, 1, 2, 1, 0, 0, 0, 0, 0, { 1, 4, 1, 2}, {  1,  8,  1, 32}, { 1, 4, 1, 1}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenBFloat16,  0, 0,  64,  32,  32, 16, 16, 16, 1, 1, 2, 1, 0, 1, 1, 0, 0, { 1, 4, 1, 2}, {  1,  8,  1, 32}, { 1, 4, 1, 1}, {  1,  8,  1, 32}},
    };
    return kernel_param_list;
}

static std::tuple<std::string, // kernel_name
                  size_t,      // block_size
                  size_t,      // grid_size
                  size_t>      // occupancy
GetImplicitGemmGtcDynamicWrwXdlopsNHWCKernel(
    const ConvolutionContext& ctx, const PerformanceConfigAsmImplicitGemmGTCWrwXdlopsNHWC& config)
{
    // const auto& n     = ctx.batch_sz;
    const auto& k = ctx.n_inputs;
    const auto& c = ctx.n_outputs;
    // const auto& ho    = ctx.in_height;
    // const auto& wo    = ctx.in_width;
    const auto& y     = ctx.kernel_size_h;
    const auto& x     = ctx.kernel_size_w;
    const auto& group = ctx.group_counts;

    // c need to be carefully padded
    const auto c_vec_min = config.tensor_b_thread_lengths[3];
    const auto c_padded  = ((c / group) + c_vec_min - 1) / c_vec_min * c_vec_min;
    const auto gemm_n = (c_padded * y * x + config.gemm_n_per_block - 1) / config.gemm_n_per_block *
                        config.gemm_n_per_block;

    const auto gemm_m = k / group;
    size_t block_size = config.BlockSize();
    size_t grid_size  = group * integer_divide_ceil(gemm_m, config.gemm_m_per_block) *
                       integer_divide_ceil(gemm_n, config.gemm_n_per_block);
    std::string kernel_name = config.ToKernelName(ctx);
    size_t occupancy        = config.ComputeKernelOccupancy();
    return std::make_tuple(kernel_name, block_size, grid_size, occupancy);
}

size_t PerformanceConfigAsmImplicitGemmGTCWrwXdlopsNHWC::ComputeKernelOccupancy() const
{
    size_t acc_usage = gemm_m_per_block * gemm_n_per_block / BlockSize();
    size_t vgpr_usage;
    size_t aux_vgpr_usage;
    size_t a_elements_per_vgpr = 1;
    size_t b_elements_per_vgpr = 1;
    size_t lds_a               = gemm_m_per_block * gemm_k_per_block * GetTypeSize(precision);
    size_t lds_b               = gemm_n_per_block * gemm_k_per_block * GetTypeSize(precision);

    size_t lds_single = lds_a >= lds_b ? lds_a * 2 : lds_b * 2;
    size_t lds_usage;
    size_t occupancy;

    const auto lds_size        = 64 * 1024;
    const auto num_vpgrs       = 256;
    const auto num_acc         = 256;
    const auto half_lds_size   = lds_size / 2;
    const auto quater_lds_size = lds_size / 4;
    const auto eighth_lds_size = lds_size / 8;
    const auto half_acc        = num_acc / 2;
    const auto third_vpgrs     = num_vpgrs / 3;

    if(nxe == 0)
    {
        aux_vgpr_usage = 36;
    }
    else
    {
        aux_vgpr_usage = 42;
    }

    if(GetTypeSize(precision) == 2 && tensor_a_thread_lengths[3] > 1)
    {
        a_elements_per_vgpr = 2;
    }
    if(GetTypeSize(precision) == 2 && tensor_b_thread_lengths[3] > 1)
    {
        b_elements_per_vgpr = 2;
    }

    size_t sz_per_element = precision == "fp16" ? 2 : 1;

    vgpr_usage = tensor_a_thread_lengths[1] * tensor_a_thread_lengths[3] / a_elements_per_vgpr +
                 tensor_b_thread_lengths[1] * tensor_b_thread_lengths[3] / b_elements_per_vgpr +
                 tensor_a_thread_lengths[1] * tensor_a_thread_lengths[3] / sz_per_element +
                 tensor_b_thread_lengths[1] * tensor_b_thread_lengths[3] / sz_per_element +
                 aux_vgpr_usage;
    if(GetTypeSize(precision) == 2)
    {
        if(lds_single >= half_lds_size ||
           (lds_single <= quater_lds_size && lds_single > eighth_lds_size && acc_usage < half_acc &&
            vgpr_usage < third_vpgrs))
        {
            lds_usage = lds_single;
        }
        else
        {
            // use lds double buffer
            lds_usage = lds_single * 2;
        }
    }
    else
    {
        lds_usage = lds_single;
    }

    MIOPEN_LOG_T("lds_usage=" << lds_usage << ", acc_usage=" << acc_usage
                              << ", vgpr_usage=" << vgpr_usage);

    occupancy =
        std::min(lds_size / lds_usage, std::min(num_acc / acc_usage, num_vpgrs / vgpr_usage));
    return occupancy;
}

void PerformanceConfigAsmImplicitGemmGTCWrwXdlopsNHWC::SetParamsForKSplit(const ConvolutionContext& ctx, const size_t& occupancy)
{
    if(ctx.IsFp16())
    {
        if(tensor_b_thread_lengths[3] == 1 ||
           miopen::IsDisabled(MIOPEN_DEBUG_CONV_IMPLICIT_GEMM_ASM_PK_ATOMIC_ADD_FP16{}))
            vector_store = 1;
    }
    else if(ctx.IsBfp16() && tensor_b_thread_lengths[3] == 1)
    {
        vector_store = 1;
    }
    gemm_k_global_split = occupancy;
}

void PerformanceConfigAsmImplicitGemmGTCWrwXdlopsNHWC::HeuristicInit(const ConvolutionContext& ctx)
{
    static const std::vector<std::tuple<int, int, int>> tile_list_fp32 = {
        std::make_tuple(128, 128, 16),
        std::make_tuple(128, 64, 16),
        std::make_tuple(64, 128, 16),
        std::make_tuple(128, 32, 16),
        std::make_tuple(256, 64, 16),
        std::make_tuple(64, 256, 16),
        std::make_tuple(64, 64, 16),
        std::make_tuple(64, 32, 16),
        std::make_tuple(32, 64, 16),
        std::make_tuple(32, 32, 32),
    };

    static const std::vector<std::tuple<int, int, int>> tile_list_fp16 = {

        std::make_tuple(256, 256, 32), std::make_tuple(256, 128, 16), std::make_tuple(256, 128, 32),
        std::make_tuple(128, 256, 16), std::make_tuple(128, 256, 32), std::make_tuple(128, 128, 16),
        std::make_tuple(128, 128, 32), std::make_tuple(256, 64, 16),  std::make_tuple(256, 64, 32),
        std::make_tuple(64, 256, 16),  std::make_tuple(64, 256, 32),  std::make_tuple(128, 64, 32),
        std::make_tuple(64, 128, 16),  std::make_tuple(64, 128, 32),  std::make_tuple(64, 64, 64),
        std::make_tuple(64, 64, 32),   std::make_tuple(256, 32, 32),  std::make_tuple(32, 256, 32),
        std::make_tuple(64, 32, 32),   std::make_tuple(64, 64, 16),
    };

    static const std::vector<std::tuple<int, int, int>> tile_list_bfp16 = {

        std::make_tuple(256, 128, 16), std::make_tuple(256, 128, 32), std::make_tuple(128, 256, 16),
        std::make_tuple(128, 256, 32), std::make_tuple(128, 128, 16), std::make_tuple(128, 128, 32),
        std::make_tuple(256, 64, 16),  std::make_tuple(256, 64, 32),  std::make_tuple(64, 256, 16),
        std::make_tuple(64, 256, 32),  std::make_tuple(128, 64, 32),  std::make_tuple(64, 128, 16),
        std::make_tuple(64, 128, 32),  std::make_tuple(64, 64, 64),   std::make_tuple(64, 64, 32),
        std::make_tuple(256, 32, 32),  std::make_tuple(32, 256, 32),  std::make_tuple(64, 32, 32),
        std::make_tuple(64, 64, 16),
    };

#ifndef NDEBUG
    const auto& c_list = GetWrwXdlopsNHWCConfigList();
    for(const auto& tile : tile_list_fp16)
    {
        int mp, np, kp;
        std::tie(mp, np, kp) = tile;
        bool found           = false;
        for(const auto& config : c_list)
        {
            if(config.precision == "fp32" || config.precision == "bf16")
                continue;
            if(config.gemm_m_per_block == mp && config.gemm_n_per_block == np &&
               config.gemm_k_per_block == kp)
            {
                found = true;
                break;
            }
        }
        if(!found)
        {
            MIOPEN_LOG_E("fp16 list can't find " << mp << "x" << np << "x" << kp);
            MIOPEN_THROW(miopenStatusInternalError);
        }
    }
    for(const auto& tile : tile_list_fp32)
    {
        int mp, np, kp;
        std::tie(mp, np, kp) = tile;
        bool found           = false;
        for(const auto& config : c_list)
        {
            if(config.precision == "fp16" || config.precision == "bf16")
                continue;
            if(config.gemm_m_per_block == mp && config.gemm_n_per_block == np &&
               config.gemm_k_per_block == kp)
            {
                found = true;
                break;
            }
        }
        if(!found)
        {
            MIOPEN_LOG_E("fp32 list can't find " << mp << "x" << np << "x" << kp);
            MIOPEN_THROW(miopenStatusInternalError);
        }
    }
    for(const auto& tile : tile_list_bfp16)
    {
        int mp, np, kp;
        std::tie(mp, np, kp) = tile;
        bool found           = false;
        for(const auto& config : c_list)
        {
            if(config.precision == "fp16" || config.precision == "fp32")
                continue;
            if(config.gemm_m_per_block == mp && config.gemm_n_per_block == np &&
               config.gemm_k_per_block == kp)
            {
                found = true;
                break;
            }
        }
        if(!found)
        {
            MIOPEN_LOG_E("fp32 list can't find " << mp << "x" << np << "x" << kp);
            MIOPEN_THROW(miopenStatusInternalError);
        }
    }
#endif

    const auto& k         = ctx.n_inputs;
    const auto& c         = ctx.n_outputs;
    const auto& y         = ctx.kernel_size_h;
    const auto& x         = ctx.kernel_size_w;
    const auto stride_h   = ConvolutionContextInterpreter::GetAdjustedConvolutionStrideH(ctx);
    const auto stride_w   = ConvolutionContextInterpreter::GetAdjustedConvolutionStrideW(ctx);
    const auto dilation_h = ctx.kernel_size_h > 1 ? ctx.kernel_dilation_h : 1;
    const auto dilation_w = ctx.kernel_size_w > 1 ? ctx.kernel_dilation_w : 1;
    const auto& pad_h     = ctx.pad_h;
    const auto& pad_w     = ctx.pad_w;
    const auto& group     = ctx.group_counts;

    const auto num_cu             = ctx.GetStream().GetMaxComputeUnits();
    const auto non_split_gridsize = 600;

    auto gemm_n        = (c / group) * y * x;
    const auto& gemm_m = k / group;

    bool unit_conv = (x == 1) && (y == 1) && (stride_h == 1) && (stride_w == 1) &&
                     (dilation_h == 1) && (dilation_w == 1) && (pad_h == 0) && (pad_w == 0);
    bool not_support_vector_store = (ctx.IsFp16() || ctx.IsBfp16()) && ((c / group) % 2 != 0);
    int m_per_block, n_per_block, k_per_block;

    std::tie(m_per_block, n_per_block, k_per_block) = HeuristicInitMacroTileNoPadGemmK(
        gemm_m, gemm_n, 0, ctx.IsFp32() ? tile_list_fp32 : (ctx.IsFp16() ? tile_list_fp16 : tile_list_bfp16));

    auto find_with_gemm_k_pad = [&](){
        // not found, let's try  gemm_k pad now.
        const auto& config_list = GetWrwXdlopsNHWCConfigList();
        size_t min_pad_pixel    = std::numeric_limits<std::size_t>::max();
        size_t selected_index   = 0;
        for(size_t i = 0; i < config_list.size(); i++)
        {
            const auto& config = config_list[i];
            if(!((ctx.IsFp16() && config.precision == "fp16") ||
                 (ctx.IsBfp16() && config.precision == "bf16") ||
                 (ctx.IsFp32() && config.precision == "fp32")))
                continue;

            if(ctx.IsFp16() || ctx.IsBfp16())
            {
                if((c / group) % config.tensor_b_thread_lengths[3] != 0)
                {
                    continue;
                }
                if((k / group) % config.tensor_a_thread_lengths[3] != 0)
                {
                    continue;
                }
            }

            if(ctx.IsFp32())
            {
                // c need to be carefully padded
                const auto c_vec_min = config.tensor_b_thread_lengths[3];
                const auto c_padded  = ((c / group) + c_vec_min - 1) / c_vec_min * c_vec_min;
                gemm_n               = (c_padded * y * x + config.gemm_n_per_block - 1) /
                         config.gemm_n_per_block * config.gemm_n_per_block;
            }

            size_t cur_pad_pixel =
                ComputeMatrixPadSize(gemm_m, config.gemm_m_per_block, 0, config.gemm_k_per_block) +
                ComputeMatrixPadSize(gemm_n, config.gemm_n_per_block, 0, config.gemm_k_per_block) +
                ComputeMatrixPadSize(
                    gemm_m, config.gemm_m_per_block, gemm_n, config.gemm_n_per_block);
            if(cur_pad_pixel < min_pad_pixel)
            {
                min_pad_pixel  = cur_pad_pixel;
                selected_index = i;
            }
        }

        size_t current_grid_size;
        size_t occupancy;
        std::tie(std::ignore, std::ignore, current_grid_size, occupancy) =
            GetImplicitGemmGtcDynamicWrwXdlopsNHWCKernel(ctx, config_list[selected_index]);
        bool need_k_split = current_grid_size <= non_split_gridsize;
        size_t gks = ComputeGemmKGlobalSplitsWith2DMerge(current_grid_size, occupancy, num_cu);
        need_k_split |= gks != 0;

        CopyParameters(config_list[selected_index]);
        if(need_k_split)
        {
            SetParamsForKSplit(ctx, occupancy);
        }
    };

    if((m_per_block == 0 && n_per_block == 0 && k_per_block == 0) || not_support_vector_store)
    {
        // not found, let's try gemm_k pad now.
        find_with_gemm_k_pad();
    }
    else
    {
        // found a suitable m/n/k, now let's prepare other parmater and initialize one
        const auto& config_list = GetWrwXdlopsNHWCConfigList();
        for(const auto& config : config_list)
        {
            if(!((ctx.IsFp16() && config.precision == "fp16") ||
                 (ctx.IsBfp16() && config.precision == "bf16") ||
                 (ctx.IsFp32() && config.precision == "fp32")))
                continue;

            if(m_per_block == config.gemm_m_per_block && n_per_block == config.gemm_n_per_block &&
               k_per_block == config.gemm_k_per_block)
            {
                size_t current_grid_size;
                size_t occupancy;
                std::tie(std::ignore, std::ignore, current_grid_size, occupancy) =
                    GetImplicitGemmGtcDynamicWrwXdlopsNHWCKernel(ctx, config);
                bool need_k_split = current_grid_size <= non_split_gridsize;
                size_t gks =
                    ComputeGemmKGlobalSplitsWith2DMerge(current_grid_size, occupancy, num_cu);
                need_k_split |= gks != 0;

                if((unit_conv && config.nxe == 0) || (!unit_conv && config.nxe != 0))
                {
                    if(!config.IsValid(ctx)) // last check before assigning a heuristic value
                        continue;
                    CopyParameters(config);
                    if(need_k_split)
                    {
                        SetParamsForKSplit(ctx, occupancy);
                    }
                    return;
                }
                else
                    continue;
            }
        }
        // last try
        find_with_gemm_k_pad();
    }
}

bool PerformanceConfigAsmImplicitGemmGTCWrwXdlopsNHWC::SetNextValue(
    const ConvolutionContext& /*config*/)
{
    if(use_spare_set)
    {
        const auto& config_list = GetWrwXdlopsNHWCConfigList();
        if(IsDefaultConstructed())
        {
            CopyParameters(config_list[index]);
        }
        else
        {
            if(gemm_k_global_split != 0)
            {
                if(NextLinear<1, WRW_MAX_GEMM_K_SPLITS>(gemm_k_global_split))
                    index++;
                else
                    return true;
            }
            else
            {
                index++;
            }
            if(index >= config_list.size())
                return false;
            CopyParameters(config_list[index]);
        }
        return true;
    }
    else
    {
        // always break generic search of main set (no spare), make sure we can use spare set
        return false;
    }
}
bool PerformanceConfigAsmImplicitGemmGTCWrwXdlopsNHWC::IsValidValue() const
{
    if(IsDefaultConstructed())
        return true;
    const auto& config_list = GetWrwXdlopsNHWCConfigList();
    if(index >= config_list.size())
        return false;
    return *this == config_list[index];
}
bool PerformanceConfigAsmImplicitGemmGTCWrwXdlopsNHWC::IsValid(const ConvolutionContext& ctx) const
{
    if(IsDefaultConstructed())
        return false;

    if(!((ctx.IsFp16() && precision == "fp16") || (ctx.IsFp32() && precision == "fp32") || (ctx.IsBfp16() && precision == "bf16")))
        return false;

    if(miopen::IsDisabled(MIOPEN_DEBUG_CONV_IMPLICIT_GEMM_ASM_PK_ATOMIC_ADD_FP16{}))
        if(ctx.IsFp16() && tensor_b_thread_lengths[3] != 1 && gemm_k_global_split != 0 && vector_store != 1)
            return false;

    const auto& k         = ctx.n_inputs;
    const auto& c         = ctx.n_outputs;
    const auto& y         = ctx.kernel_size_h;
    const auto& x         = ctx.kernel_size_w;
    const auto stride_h   = ConvolutionContextInterpreter::GetAdjustedConvolutionStrideH(ctx);
    const auto stride_w   = ConvolutionContextInterpreter::GetAdjustedConvolutionStrideW(ctx);
    const auto dilation_h = ctx.kernel_size_h > 1 ? ctx.kernel_dilation_h : 1;
    const auto dilation_w = ctx.kernel_size_w > 1 ? ctx.kernel_dilation_w : 1;
    const auto& pad_h     = ctx.pad_h;
    const auto& pad_w     = ctx.pad_w;
    const auto precision  = ctx.IsFp16() ? miopenHalf : (ctx.IsBfp16() ? miopenBFloat16 : miopenFloat);
    const auto& group     = ctx.group_counts;

    bool unit_conv = (x == 1) && (y == 1) && (stride_h == 1) && (stride_w == 1) &&
                     (dilation_h == 1) && (dilation_w == 1) && (pad_h == 0) && (pad_w == 0);

    if((nxe == 0) && !unit_conv)
    {
        return false;
    }

    if(precision != miopenFloat)
    {
        if((c / group) % tensor_b_thread_lengths[3] != 0)
        {
            return false;
        }
        if((k / group) % tensor_a_thread_lengths[3] != 0)
        {
            return false;
        }
    }

    // add more restriction for spare
    if(use_spare_set)
    {
        // non 1x1 kernel(except padding gemm_k) can't run 1x1 case
        if(unit_conv && nxe != 0)
            return false;
    }

    return true;
}

PerformanceConfigAsmImplicitGemmGTCWrwXdlopsNHWC
ConvAsmImplicitGemmGTCDynamicWrwXdlopsNHWC::GetPerformanceConfig(
    const ConvolutionContext& params) const
{
    PerformanceConfigAsmImplicitGemmGTCWrwXdlopsNHWC pp;
    pp.HeuristicInit(params);
    MIOPEN_LOG_I(pp.ToString());
    return pp;
}
bool ConvAsmImplicitGemmGTCDynamicWrwXdlopsNHWC::IsValidPerformanceConfig(
    const ConvolutionContext& problem,
    const PerformanceConfigAsmImplicitGemmGTCWrwXdlopsNHWC& config) const
{
    return config.IsValidValue() && config.IsValid(problem);
}
PerformanceConfigAsmImplicitGemmGTCWrwXdlopsNHWC
ConvAsmImplicitGemmGTCDynamicWrwXdlopsNHWC::Search(const ConvolutionContext& ctx,
                                                   const AnyInvokeParams& invoke_ctx) const
{
    return GenericSearch(*this, ctx, invoke_ctx);
}

bool ConvAsmImplicitGemmGTCDynamicWrwXdlopsNHWC::IsApplicable(const ConvolutionContext& ctx) const
{
    if(miopen::IsDisabled(MIOPEN_DEBUG_CONV_IMPLICIT_GEMM_ASM_WRW_GTC_XDLOPS_NHWC{}))
        return false;

    if(miopen::IsEnabled(MIOPEN_DEBUG_CONVOLUTION_DETERMINISTIC{}))
        return false;

    const auto device_name = ctx.GetStream().GetDeviceName();
    if((device_name != "gfx908") && (device_name != "gfx90a"))
        return false;

    if(!ctx.use_asm_kernels)
        return false;

    if(!ctx.direction.IsBackwardWrW())
        return false;

    if(!ctx.Is2d())
        return false;

    if(!ctx.IsFp32() && !ctx.IsFp16() && !(ctx.IsBfp16() && device_name == "gfx90a"))
        return false;

    if(!ctx.rmv.IsV3())
        return false;

    const auto target = ctx.GetStream().GetTargetProperties();
    if(target.Xnack() && *target.Xnack())
        return false; // NOLINT (readability-simplify-boolean-expr)

    if(0 == igemm_split_batch_size(ctx.out_height, 
                                   ctx.out_width, 
                                   ctx.in_height, 
                                   ctx.in_width, 
                                   ctx.batch_sz, 
                                   ctx.n_inputs, 
                                   ctx.n_outputs, 
                                   miopen::GetTypeSize(ctx.in_data_type)))
        return false;

    return true;
}

inline std::vector<OpKernelArg>
ComputeDynamicIGemmWrwKernelArgsNHWC(const conv::ProblemDescription& conv_problem,
                                     const int gemm_k_global_splits,
                                     const int gemm_k_per_wg,
                                     const int splits_4G)
{
    int hi         = conv_problem.GetOutHeight();
    int wi         = conv_problem.GetOutWidth();
    int n          = conv_problem.GetInBatchSize();
    int k          = conv_problem.GetInChannels();
    int c          = conv_problem.GetOutChannels();
    int ho         = conv_problem.GetInHeight();
    int wo         = conv_problem.GetInWidth();
    int stride_h   = conv_problem.GetInHeight() > 1 ? conv_problem.GetKernelStrideH() : 1;
    int stride_w   = conv_problem.GetInWidth() > 1 ? conv_problem.GetKernelStrideW() : 1;
    int dilation_h = conv_problem.GetWeightsHeight() > 1 ? conv_problem.GetDilationH() : 1;
    int dilation_w = conv_problem.GetWeightsWidth() > 1 ? conv_problem.GetDilationW() : 1;
    int pad_h      = conv_problem.GetPadH();
    int pad_w      = conv_problem.GetPadW();
    int y          = conv_problem.GetWeightsHeight();
    int x          = conv_problem.GetWeightsWidth();
    int group      = conv_problem.GetGroupCount();

    std::vector<OpKernelArg> opArgs;
    opArgs.emplace_back(0); // placeholder
    opArgs.emplace_back(0); // placeholder
    opArgs.emplace_back(0); // placeholder
    opArgs.emplace_back(hi);
    opArgs.emplace_back(wi);
    opArgs.emplace_back(n / splits_4G);
    opArgs.emplace_back(k / group);
    opArgs.emplace_back(c / group);
    opArgs.emplace_back(ho);
    opArgs.emplace_back(wo);
    opArgs.emplace_back(stride_h);
    opArgs.emplace_back(stride_w);
    opArgs.emplace_back(dilation_h);
    opArgs.emplace_back(dilation_w);
    opArgs.emplace_back(pad_h);
    opArgs.emplace_back(pad_w);
    opArgs.emplace_back(y);
    opArgs.emplace_back(x);
    opArgs.emplace_back(gemm_k_global_splits);
    opArgs.emplace_back(group);
    opArgs.emplace_back(gemm_k_per_wg);

    return opArgs;
}

size_t
ConvAsmImplicitGemmGTCDynamicWrwXdlopsNHWC::GetWorkspaceSize(const ConvolutionContext& ctx) const
{
    const auto& hi        = ctx.out_height;
    const auto& wi        = ctx.out_width;
    const auto& n         = ctx.batch_sz;
    const auto& k         = ctx.n_inputs;
    const auto& c         = ctx.n_outputs;
    const auto& ho        = ctx.in_height;
    const auto& wo        = ctx.in_width;
    const auto& y         = ctx.kernel_size_h;
    const auto& x         = ctx.kernel_size_w;
    const auto& group     = ctx.group_counts;
    const auto is_nchw     = ctx.IsLayoutDefault();

    size_t size_trans_input  = 0;
    size_t size_trans_weight = 0;
    size_t size_trans_output = 0;
    size_t size_tensor_cast  = 0;

    constexpr size_t buf_alignment     = 256;

    size_t workspace_size = 0;
    if(is_nchw)
    {
        TransposeSolutionDefault2Nhwc trans_input(ctx, ctx.out_data_type, n, c, hi, wi);
        TransposeSolutionNhwc2Default trans_weight(ctx,
                                                 ctx.weights_data_type,
                                                 k,
                                                 c / group,
                                                 y,
                                                 x); // group * k_per_group as batch for weight
        TransposeSolutionDefault2Nhwc trans_output(ctx, ctx.in_data_type, n, k, ho, wo);
        if(!trans_input.IsSkippable())
            size_trans_input  = trans_input.GetSize();
        if(!trans_weight.IsSkippable())
            size_trans_weight = trans_weight.GetSize();
        if(!trans_output.IsSkippable())
            size_trans_output = trans_output.GetSize();

    }

    if(!ctx.IsFp32())
        size_tensor_cast = miopen::GetTypeSize(miopenFloat) // The intermediate output of the 1st
                                                           // kernel is FP32, when using FP32 atomic
                           * (k / group) * c * y * x;

    MultiBufferWorkspaceTraits wt({size_trans_input, size_trans_weight, size_trans_output, size_tensor_cast}, buf_alignment);
    workspace_size = wt.GetSize();

    return workspace_size;
}

ConvSolution ConvAsmImplicitGemmGTCDynamicWrwXdlopsNHWC::GetSolution(
    const ConvolutionContext& ctx,
    const PerformanceConfigAsmImplicitGemmGTCWrwXdlopsNHWC& config,
    bool) const
{
    ConvSolution result;
    KernelInfo kernel;

    std::string kernel_name;
    size_t block_size;
    size_t grid_size;

    std::tie(kernel_name, block_size, grid_size, std::ignore) =
        GetImplicitGemmGtcDynamicWrwXdlopsNHWCKernel(ctx, config);

    const auto& hi        = ctx.out_height;
    const auto& wi        = ctx.out_width;
    const auto& n         = ctx.batch_sz;
    const auto& k         = ctx.n_inputs;
    const auto& c         = ctx.n_outputs;
    const auto& ho        = ctx.in_height;
    const auto& wo        = ctx.in_width;
    const auto& y         = ctx.kernel_size_h;
    const auto& x         = ctx.kernel_size_w;
    const auto& group     = ctx.group_counts;

    auto splits_4G = igemm_split_batch_size(hi, wi, ho, wo, n, k, c, miopen::GetTypeSize(ctx.in_data_type));

    size_t gemm_k_global_splits =
        config.gemm_k_global_split >= 1
            ? ComputeGemmKGlobalSplitsWith2DMerge(
                  grid_size, config.gemm_k_global_split, ctx.GetStream().GetMaxComputeUnits())
            : 1;
    size_t min_n_per_block = config.nxe == 1 ? config.tensor_a_thread_lengths[1] : 1;
    size_t nb_per_block =
        config.nxe == 1 ? config.tensor_a_cluster_lengths[1] : config.gemm_k_per_block;

    if(gemm_k_global_splits == 0)
        gemm_k_global_splits = 1;

    // compute workload for 1 workgroup and update gemmk splits (remove the ones compute 0 data)
    size_t gemmk = integer_divide_ceil(static_cast<size_t>(ctx.batch_sz / splits_4G), min_n_per_block) *
                   ctx.in_height * ctx.in_width;
    size_t gemmk_per_wg = integer_divide_ceil(gemmk, gemm_k_global_splits);

    gemmk_per_wg         = (gemmk_per_wg + nb_per_block - 1) / nb_per_block * nb_per_block;
    gemm_k_global_splits = integer_divide_ceil(gemmk, gemmk_per_wg);

    const auto required_workspace_size = GetWorkspaceSize(ctx);
    result.workspace_sz                 = required_workspace_size;

    kernel.kernel_file = kernel_name + ".s";
    kernel.kernel_name = kernel_name;
    kernel.g_wk.clear();
    kernel.g_wk.push_back(grid_size * block_size);
    kernel.g_wk.push_back(splits_4G);
    kernel.g_wk.push_back(gemm_k_global_splits);
    kernel.l_wk.clear();
    kernel.l_wk.push_back(block_size);
    kernel.l_wk.push_back(1);
    kernel.l_wk.push_back(1);

    const auto& conv_problem          = ctx.conv_problem;
    const auto isFp16                 = conv_problem.IsFp16();
    const auto isGfx90aFp16altSupport = (ctx.GetStream().GetDeviceName() == "gfx90a") && isFp16;
    const bool need_cast = (conv_problem.IsBfp16() && gemm_k_global_splits >= 1) || (isFp16 && gemm_k_global_splits >= 1 && (config.tensor_b_thread_lengths[3] == 1 || config.vector_store == 1));

    const auto is_nchw     = ctx.IsLayoutDefault();

    result.construction_params.push_back(kernel); // Intentionally without options.
    std::ostringstream options;                   // Common options for both kernels.
    GenerateClangDefsym(options, "ROCM_METADATA_VERSION", ctx.rmv.UseV3() ? 5 : 4);

    std::ostringstream opts_0(options.str(), std::ios_base::ate); // Options for normal kernel.
    if(isGfx90aFp16altSupport)
        GenerateClangDefsym(opts_0, "igemm_wrw_fp16_alt_impl", 0);
    result.construction_params[0].comp_options = opts_0.str();
    std::ostringstream msg;

    if(isGfx90aFp16altSupport)
    {
        result.construction_params.push_back(kernel);
        std::ostringstream opts_1(options.str(), std::ios_base::ate); // Options for alt kernel.
        GenerateClangDefsym(opts_1, "igemm_wrw_fp16_alt_impl", 1);
        result.construction_params[1].comp_options = opts_1.str();
        if(miopen::IsLogging(LoggingLevel::Info2))
            msg << ", fp16_alt:" <<ctx.conv_problem.GetConv().attribute.gfx90aFp16alt.GetWrW();
    }

    const auto lowp_quant = conv_problem.GetConv().lowp_quant;

    auto opArgs =
        ComputeDynamicIGemmWrwKernelArgsNHWC(conv_problem, gemm_k_global_splits, gemmk_per_wg, splits_4G);
    std::vector<std::vector<OpKernelArg>> opArgsTrans;
    size_t trans_input_offset = 0;
    size_t trans_input_size   = 0;

    size_t trans_weight_offset = 0;
    size_t trans_weight_size   = 0;

    size_t trans_output_offset = 0;
    size_t trans_output_size   = 0;

    bool trans_input_skippable  = false;
    bool trans_weight_skippable = false;
    bool trans_output_skippable = false;

    int trans_input_idx  = -1;
    int trans_weight_idx = -1;
    int trans_output_idx = -1;

    constexpr size_t buf_alignment = 256;
    
    if(is_nchw)
    {
        TransposeSolutionDefault2Nhwc trans_input(ctx, ctx.out_data_type, n, c, hi, wi);
        TransposeSolutionNhwc2Default trans_weight(ctx,
                                                 ctx.weights_data_type,
                                                 k,
                                                 c / group,
                                                 y,
                                                 x); // group * k_per_group as batch for weight
        TransposeSolutionDefault2Nhwc trans_output(ctx, ctx.in_data_type, n, k, ho, wo);

        trans_input_skippable  = trans_input.IsSkippable();
        trans_weight_skippable = trans_weight.IsSkippable();
        trans_output_skippable = trans_output.IsSkippable();

        if(!trans_input_skippable){
            result.construction_params.push_back(trans_input.GetKernel());
            opArgsTrans.emplace_back(trans_input.GetKernelArg());
            if(miopen::IsLogging(LoggingLevel::Info2))
                msg << ", inp trans:"<<trans_input.GetKernelName();
        }
        if(!trans_weight_skippable){
            result.construction_params.push_back(trans_weight.GetKernel());
            opArgsTrans.emplace_back(trans_weight.GetKernelArg());
            if(miopen::IsLogging(LoggingLevel::Info2))
                msg << ", wei trans:"<<trans_weight.GetKernelName();
        }
        if(!trans_output_skippable){
            result.construction_params.push_back(trans_output.GetKernel());
            opArgsTrans.emplace_back(trans_output.GetKernelArg());
            if(miopen::IsLogging(LoggingLevel::Info2))
                msg << ", out trans:"<<trans_output.GetKernelName();
        }

        trans_input_size  = trans_input_skippable ? 0 : trans_input.GetSize();
        trans_weight_size = trans_weight_skippable ? 0 : trans_weight.GetSize();
        trans_output_size = trans_output_skippable ? 0 : trans_output.GetSize();

        int idx = 0;
        if(!trans_input_skippable)
            trans_input_idx = idx++;
        if(!trans_weight_skippable)
            trans_weight_idx = idx++;
        if(!trans_output_skippable)
            trans_output_idx = idx++;
    }

    MIOPEN_LOG_I2(SolverDbId() << ": " << config.ToString() << msg.str());

    const size_t cast_size = need_cast ?
        miopen::GetTypeSize(miopenFloat) * k * (c / group) * y * x  : 0;

    MultiBufferWorkspaceTraits wt({trans_input_size, trans_weight_size, trans_output_size, cast_size}, buf_alignment);

    trans_input_offset  = wt.GetOffset(0);
    trans_weight_offset = wt.GetOffset(1);
    trans_output_offset = wt.GetOffset(2);

    const size_t cast_offset = wt.GetOffset(3);

    const int kID_trans_start = isGfx90aFp16altSupport ? 2 : 1;

    const TensorDescriptor cast_desc(miopenFloat, ctx.conv_problem.GetWeights().GetLengths(), ctx.conv_problem.GetWeights().GetStrides());
    auto null_buf = shared<Data_t> {};

    if(need_cast)
    {
        result.invoker_factory = [=](const std::vector<Kernel>& kernels) mutable {
            return [=](const Handle& handle, const AnyInvokeParams& primitive_parameters) mutable {
                decltype(auto) wrw_invoke_params =
                    primitive_parameters.CastTo<conv::WrWInvokeParams>();
                const auto& tensors = wrw_invoke_params.tensors;
                const auto ker        = handle.Run(
                    kernels[(isGfx90aFp16altSupport && wrw_invoke_params.gfx90aFp16alt) ? 1 : 0]);
                const auto& workSpace     = wrw_invoke_params.workSpace;
                const auto& workSpaceSize = wrw_invoke_params.workSpaceSize;
                float elapsed             = 0;
                float zero                = 0.f;

                if(workSpace == nullptr || workSpaceSize < required_workspace_size)
                    MIOPEN_THROW("Not enough workspace has been provided for "
                                 "ConvAsmImplicitGemmGTCDynamicWrwXdlops with fp16 and atomic "
                                 "add.");
                auto trans_input_buf = trans_input_size== 0 ?null_buf : handle.CreateSubBuffer(
                    workSpace, trans_input_offset, trans_input_size);
                auto trans_weight_buf = trans_weight_size==0 ? null_buf : handle.CreateSubBuffer(
                    workSpace, trans_weight_offset, trans_weight_size);
                auto trans_output_buf = trans_output_size ==0 ? null_buf : handle.CreateSubBuffer(
                    workSpace, trans_output_offset, trans_output_size);
                auto cast_buf = cast_size == 0 ? null_buf : handle.CreateSubBuffer(
                    workSpace, cast_offset, cast_size);

                SetTensor(handle, cast_desc, cast_buf.get(), &zero);
                if(handle.IsProfilingEnabled())
                    elapsed += handle.GetKernelTime();

                if(is_nchw)
                {
                    if(!trans_input_skippable){
                        auto& karg_input = opArgsTrans[trans_input_idx];
                        karg_input[0] = OpKernelArg(trans_input_buf.get());
                        karg_input[1] = OpKernelArg(tensors.x);
                        handle.Run(kernels[kID_trans_start + trans_input_idx])(karg_input);
                        if(handle.IsProfilingEnabled())
                            elapsed += handle.GetKernelTime();
                    }
                    if(!trans_output_skippable){
                        auto& karg_output = opArgsTrans[trans_output_idx];
                        karg_output[0] = OpKernelArg(trans_output_buf.get());
                        karg_output[1] = OpKernelArg(tensors.dy);
                        handle.Run(kernels[kID_trans_start + trans_output_idx])(karg_output);
                        if(handle.IsProfilingEnabled())
                            elapsed += handle.GetKernelTime();
                    }
                }

                opArgs[0] = (is_nchw && !trans_input_skippable) ? OpKernelArg(trans_input_buf.get()) : OpKernelArg(tensors.x);
                opArgs[1] = OpKernelArg(cast_buf.get());
                opArgs[2] = (is_nchw && !trans_output_skippable) ? OpKernelArg(trans_output_buf.get()) : OpKernelArg(tensors.dy);

                ker(opArgs);
                if(handle.IsProfilingEnabled())
                    elapsed += handle.GetKernelTime();

                CastTensor(handle,
                           &lowp_quant,
                           cast_desc,
                           cast_buf.get(),
                           tensors.dwDesc,
                           (is_nchw && !trans_weight_skippable) ? trans_weight_buf.get() :  tensors.dw,
                           0,
                           0);

                if(is_nchw && !trans_weight_skippable)
                {
                    auto& karg_weight = opArgsTrans[trans_weight_idx];
                    karg_weight[0]    = OpKernelArg(tensors.dw);
                    karg_weight[1]    = OpKernelArg(trans_weight_buf.get());
                    handle.Run(kernels[kID_trans_start + trans_weight_idx])(karg_weight);
                    if(handle.IsProfilingEnabled())
                        elapsed += handle.GetKernelTime();
                }

                if(handle.IsProfilingEnabled())
                    elapsed += handle.GetKernelTime();

                if(handle.IsProfilingEnabled())
                {
                    handle.ResetKernelTime();
                    handle.AccumKernelTime(elapsed);
                }
            };
        };
    }
    else
    {
        result.invoker_factory = [=](const std::vector<Kernel>& kernels) mutable {
            return [=](const Handle& handle, const AnyInvokeParams& primitive_parameters) mutable {
                decltype(auto) wrw_invoke_params =
                    primitive_parameters.CastTo<conv::WrWInvokeParams>();
                const auto& tensors = wrw_invoke_params.tensors;
                const auto ker        = handle.Run(
                    kernels[(isGfx90aFp16altSupport && wrw_invoke_params.gfx90aFp16alt) ? 1 : 0]);
                const auto& workSpace     = wrw_invoke_params.workSpace;
                float elapsed = 0;
                float zero    = 0.f;

                auto trans_input_buf = trans_input_size== 0 ?null_buf : handle.CreateSubBuffer(
                    workSpace, trans_input_offset, trans_input_size);
                auto trans_weight_buf = trans_weight_size==0 ? null_buf : handle.CreateSubBuffer(
                    workSpace, trans_weight_offset, trans_weight_size);
                auto trans_output_buf = trans_output_size ==0 ? null_buf : handle.CreateSubBuffer(
                    workSpace, trans_output_offset, trans_output_size);
                auto cast_buf = cast_size == 0 ? null_buf : handle.CreateSubBuffer(
                    workSpace, cast_offset, cast_size);

                opArgs[0] = (is_nchw && !trans_input_skippable) ? OpKernelArg(trans_input_buf.get()) : OpKernelArg(tensors.x);
                opArgs[1] = (is_nchw && !trans_weight_skippable)? OpKernelArg(trans_weight_buf.get()) : OpKernelArg(tensors.dw);
                opArgs[2] = (is_nchw && !trans_output_skippable) ? OpKernelArg(trans_output_buf.get()) : OpKernelArg(tensors.dy);

                SetTensor(handle, tensors.dwDesc, (is_nchw && !trans_weight_skippable) ? trans_weight_buf.get() : tensors.dw, &zero);
                if(handle.IsProfilingEnabled())
                    elapsed += handle.GetKernelTime();

                if(is_nchw)
                {
                    if(!trans_input_skippable){

                        auto& karg_input = opArgsTrans[trans_input_idx];
                        karg_input[0] = OpKernelArg(trans_input_buf.get());
                        karg_input[1] = OpKernelArg(tensors.x);
                        handle.Run(kernels[kID_trans_start + trans_input_idx])(karg_input);
                        if(handle.IsProfilingEnabled())
                            elapsed += handle.GetKernelTime();
                    }
                    if(!trans_output_skippable){

                        auto& karg_output = opArgsTrans[trans_output_idx];
                        karg_output[0] = OpKernelArg(trans_output_buf.get());
                        karg_output[1] = OpKernelArg(tensors.dy);
                        handle.Run(kernels[kID_trans_start + trans_output_idx])(karg_output);
                        if(handle.IsProfilingEnabled())
                            elapsed += handle.GetKernelTime();
                    }
                }

                ker(opArgs);
                if(handle.IsProfilingEnabled())
                    elapsed += handle.GetKernelTime();

                if(is_nchw && !trans_weight_skippable)
                {
                    auto& karg_weight = opArgsTrans[trans_weight_idx];
                    karg_weight[0]    = OpKernelArg(tensors.dw);
                    karg_weight[1]    = OpKernelArg(trans_weight_buf.get());
                    handle.Run(kernels[kID_trans_start + trans_weight_idx])(karg_weight);
                    if(handle.IsProfilingEnabled())
                        elapsed += handle.GetKernelTime();
                }

                if(handle.IsProfilingEnabled())
                {
                    handle.ResetKernelTime();
                    handle.AccumKernelTime(elapsed);
                }
            };
        };
    }

    return result;
}

} // namespace solver
} // namespace miopen
