/******************************************************************************* * * MIT License * * Copyright (c) 2020 Advanced Micro Devices, Inc. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in all * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. * *******************************************************************************/ #include #include "driver.hpp" #include "test.hpp" #include "verify.hpp" #include "get_handle.hpp" #include "tensor_holder.hpp" #include "random.hpp" #include #include #include #include #include #include #include #include #include #include #include "cpu_reduce_util.hpp" /// Not reproducible with ROCm 4.1 and 4.2. #define WORKAROUND_GPU_NUMERIC_ERROR \ (HIP_PACKAGE_VERSION_MAJOR == 3 && HIP_PACKAGE_VERSION_MINOR == 7) template struct verify_reduce_with_indices { miopen::ReduceTensorDescriptor reduce; tensor input; tensor output; tensor workspace; tensor indices; float alpha; float beta; miopenReduceTensorOp_t reduceOp; miopenDataType_t compTypeVal; miopenNanPropagation_t nanOpt; miopenReduceTensorIndices_t indicesOpt; miopenIndicesType_t indicesType; verify_reduce_with_indices( // NOLINT (hicpp-member-init) const miopen::ReduceTensorDescriptor& reduce_, const tensor& input_, const tensor& output_, const tensor& workspace_, const tensor& indices_, float alpha_, float beta_) { reduce = reduce_; input = input_; output = output_; workspace = workspace_; indices = indices_; alpha = alpha_; beta = beta_; reduceOp = reduce.reduceTensorOp_; compTypeVal = reduce.reduceTensorCompType_; nanOpt = reduce.reduceTensorNanOpt_; indicesOpt = reduce.reduceTensorIndices_; indicesType = reduce.reduceTensorIndicesType_; } tensor cpu() const { using reduce::convert_type; std::tuple, tensor> results; if(compTypeVal == miopenFloat) { if(std::is_same::value) results = cpuImpl(); else results = cpuImpl(); } else if(compTypeVal == miopenHalf) { if(std::is_same::value) results = cpuImpl(); else if(std::is_same::value) results = cpuImpl(); else results = cpuImpl(); } else if(compTypeVal == miopenDouble) results = cpuImpl(); if(toVerifyData) { const auto dimLengths = output.desc.GetLengths(); auto result_dataFloat = make_tensor(dimLengths); auto& result_dataT = std::get<0>(results); for(size_t i = 0; i < result_dataT.data.size(); i++) result_dataFloat.data[i] = convert_type(result_dataT.data[i]); return (result_dataFloat); } else { const auto dimLengths = indices.desc.GetLengths(); auto result_indicesFloat = make_tensor(dimLengths); auto& result_indices = std::get<1>(results); for(size_t i = 0; i < result_indices.data.size(); i++) result_indicesFloat.data[i] = static_cast(result_indices.data[i]); return (result_indicesFloat); }; }; tensor gpu() const { using reduce::convert_type; std::tuple, tensor> results; results = gpuImpl(); if(toVerifyData) { const auto dimLengths = output.desc.GetLengths(); auto result_dataFloat = make_tensor(dimLengths); tensor& result_dataT = std::get<0>(results); for(size_t i = 0; i < result_dataT.data.size(); i++) result_dataFloat.data[i] = convert_type(result_dataT.data[i]); return (result_dataFloat); } else { const auto dimLengths = indices.desc.GetLengths(); auto result_indicesFloat = make_tensor(dimLengths); tensor& result_indices = std::get<1>(results); for(size_t i = 0; i < result_indices.data.size(); i++) result_indicesFloat.data[i] = static_cast(result_indices.data[i]); return (result_indicesFloat); }; }; template std::tuple, tensor> cpuImpl() const { using reduce::binop_with_nan_check; using reduce::binop_with_nan_check2; using reduce::convert_type; using reduce::float_equal_one; using reduce::float_equal_zero; using reduce::PosUnaryOpFn; using reduce::PreUnaryOpFn; using reduce::ReduceOpFn2; using reduce::ReduceOpZeroVal; auto inLengths = input.desc.GetLengths(); auto outLengths = output.desc.GetLengths(); auto inStrides = input.desc.GetStrides(); auto outStrides = output.desc.GetStrides(); // replicate auto res = output; auto res_indices = indices; std::vector invariantLengths; std::vector toReduceLengths; std::vector invariantDims; std::vector toReduceDims; for(int i = 0; i < inLengths.size(); i++) if(inLengths[i] == outLengths[i]) invariantDims.push_back(i); else toReduceDims.push_back(i); invariantLengths.resize(invariantDims.size()); for(int i = 0; i < invariantDims.size(); i++) invariantLengths[i] = inLengths[invariantDims[i]]; toReduceLengths.resize(toReduceDims.size()); for(int i = 0; i < toReduceDims.size(); i++) toReduceLengths[i] = inLengths[toReduceDims[i]]; bool reduceAllDims = invariantDims.empty(); auto opReduce = ReduceOpFn2(reduceOp); std::size_t divider = std::accumulate( toReduceLengths.begin(), toReduceLengths.end(), std::size_t{1}, std::multiplies<>{}); auto PreUnaryOp = PreUnaryOpFn(reduceOp, divider); if(reduceAllDims) { std::vector> indexes_1; get_all_indexes(inLengths, 0, indexes_1); compType accuVal = ReduceOpZeroVal(reduceOp); int accuIndex = 0; // go through indexes of the invariant dimensions for(const auto& src_index : indexes_1) { auto src_offset = get_offset_from_index(inStrides, src_index); auto currVal = convert_type(input.data[src_offset]); // unary operation before reducing, only needed by AMAX. For MIN/MAX, nothing is // actually done PreUnaryOp(currVal); int currIndex = get_flatten_offset(inLengths, src_index); binop_with_nan_check2(nanOpt, opReduce, accuVal, currVal, accuIndex, currIndex); }; // scale the accumulated value if(!float_equal_one(alpha)) accuVal *= convert_type(alpha); // scale the prior dst value and add it to the accumulated value if(!float_equal_zero(beta)) { accuVal += convert_type(output.data[0]) * convert_type(beta); }; // store the reduced value to dst location res.data[0] = convert_type(accuVal); res_indices.data[0] = accuIndex; } else { std::vector> indexes_1, indexes_2; get_all_indexes(invariantLengths, 0, indexes_1); get_all_indexes(toReduceLengths, 0, indexes_2); // go through indexes of the invariant dimensions for(const auto& index_1 : indexes_1) { std::vector src_index; std::vector dst_index; src_index.resize(inLengths.size()); dst_index.resize(inLengths.size()); std::fill(dst_index.begin(), dst_index.end(), 0); for(int k = 0; k < invariantDims.size(); k++) dst_index[invariantDims[k]] = index_1[k]; auto dst_offset = get_offset_from_index(outStrides, dst_index); // generate the part of the index belonging to the invariant dims for(int k = 0; k < invariantDims.size(); k++) src_index[invariantDims[k]] = index_1[k]; compType accuVal = ReduceOpZeroVal(reduceOp); int accuIndex = 0; // go through indexes of the toReduce dimensions for(const auto& index_2 : indexes_2) { // generate the part of the index belonging to the toReduce dims for(int k = 0; k < toReduceDims.size(); k++) src_index[toReduceDims[k]] = index_2[k]; auto src_offset = get_offset_from_index(inStrides, src_index); auto currVal = convert_type(input.data[src_offset]); // unary operation before reducing, only needed by AMAX. For MIN/MAX, nothing is // actually done PreUnaryOp(currVal); auto currIndex = get_flatten_offset(toReduceLengths, index_2); binop_with_nan_check2(nanOpt, opReduce, accuVal, currVal, accuIndex, currIndex); }; // scale the accumulated value if(!float_equal_one(alpha)) accuVal *= convert_type(alpha); // scale the prior dst value and add it to the accumulated value if(!float_equal_zero(beta)) accuVal += convert_type(output.data[dst_offset]) * convert_type(beta); // store the reduced value to dst location res.data[dst_offset] = convert_type(accuVal); res_indices.data[dst_offset] = accuIndex; // store the index }; }; return (std::make_tuple(res, res_indices)); } std::tuple, tensor> gpuImpl() const { auto&& handle = get_handle(); auto input_dev = handle.Write(input.data); auto output_dev = handle.Write(output.data); // replicate auto res = output; auto res_indices = indices; auto indices_dev = handle.Write(indices.data); std::size_t ws_sizeInBytes = workspace.desc.GetElementSize() * sizeof(T); std::size_t indices_sizeInBytes = indices.desc.GetElementSize() * sizeof(int); const double alpha64 = alpha; const double beta64 = beta; const void* const alphaPtr = (std::is_same::value) ? static_cast(&alpha64) : static_cast(&alpha); const void* const betaPtr = (std::is_same::value) ? static_cast(&beta64) : static_cast(&beta); if(ws_sizeInBytes > 0) { auto workspace_dev = handle.Write(workspace.data); reduce.ReduceTensor(get_handle(), indices_dev.get(), indices_sizeInBytes, workspace_dev.get(), ws_sizeInBytes, alphaPtr, input.desc, input_dev.get(), betaPtr, output.desc, output_dev.get()); } else { reduce.ReduceTensor(get_handle(), indices_dev.get(), indices_sizeInBytes, nullptr, 0, alphaPtr, input.desc, input_dev.get(), betaPtr, output.desc, output_dev.get()); }; res.data = handle.Read(output_dev, res.data.size()); res_indices.data = handle.Read(indices_dev, res_indices.data.size()); return (std::make_tuple(res, res_indices)); } void fail(int) const { std::cout << "verify_reduce_with_indices failed" << std::endl; std::cout << "Input Tensor" << " " << input.desc.ToString() << std::endl; } }; template struct verify_reduce_no_indices { miopen::ReduceTensorDescriptor reduce; tensor input; tensor output; tensor workspace; float alpha; float beta; miopenReduceTensorOp_t reduceOp; miopenDataType_t compTypeVal; miopenNanPropagation_t nanOpt; verify_reduce_no_indices( // NOLINT (hicpp-member-init) const miopen::ReduceTensorDescriptor& reduce_, const tensor& input_, const tensor& output_, const tensor& workspace_, float alpha_, float beta_) { reduce = reduce_; input = input_; output = output_; workspace = workspace_; alpha = alpha_; beta = beta_; reduceOp = reduce.reduceTensorOp_; compTypeVal = reduce.reduceTensorCompType_; nanOpt = reduce.reduceTensorNanOpt_; } tensor cpu() { using reduce::convert_type; tensor result; if(compTypeVal == miopenFloat) { if(std::is_same::value) result = cpuImpl(); else result = cpuImpl(); } else if(compTypeVal == miopenHalf) { if(std::is_same::value) result = cpuImpl(); else if(std::is_same::value) result = cpuImpl(); else result = cpuImpl(); } else if(compTypeVal == miopenDouble) result = cpuImpl(); const auto dimLengths = output.desc.GetLengths(); auto result_dataFloat = make_tensor(dimLengths); for(size_t i = 0; i < result.data.size(); i++) result_dataFloat.data[i] = convert_type(result.data[i]); return (result_dataFloat); }; template tensor cpuImpl() const { using reduce::binop_with_nan_check; using reduce::binop_with_nan_check2; using reduce::convert_type; using reduce::float_equal_one; using reduce::float_equal_zero; using reduce::PosUnaryOpFn; using reduce::PreUnaryOpFn; using reduce::ReduceOpFn; using reduce::ReduceOpZeroVal; auto inLengths = input.desc.GetLengths(); auto outLengths = output.desc.GetLengths(); auto inStrides = input.desc.GetStrides(); auto outStrides = output.desc.GetStrides(); // replicate auto res = output; std::vector invariantLengths; std::vector toReduceLengths; std::vector invariantDims; std::vector toReduceDims; for(int i = 0; i < inLengths.size(); i++) if(inLengths[i] == outLengths[i]) invariantDims.push_back(i); else toReduceDims.push_back(i); invariantLengths.resize(invariantDims.size()); for(int i = 0; i < invariantDims.size(); i++) invariantLengths[i] = inLengths[invariantDims[i]]; toReduceLengths.resize(toReduceDims.size()); for(int i = 0; i < toReduceDims.size(); i++) toReduceLengths[i] = inLengths[toReduceDims[i]]; bool reduceAllDims = invariantDims.empty(); auto opReduce = ReduceOpFn(reduceOp); std::size_t divider = std::accumulate( toReduceLengths.begin(), toReduceLengths.end(), std::size_t{1}, std::multiplies<>{}); auto PreUnaryOp = PreUnaryOpFn(reduceOp, divider); auto PosUnaryOp = PosUnaryOpFn(reduceOp, divider); if(reduceAllDims) { std::vector> indexes_1; get_all_indexes(inLengths, 0, indexes_1); compType accuVal = ReduceOpZeroVal(reduceOp); // go through indexes of the invariant dimensions for(const auto& src_index : indexes_1) { auto src_offset = get_offset_from_index(inStrides, src_index); auto currVal = convert_type(input.data[src_offset]); PreUnaryOp(currVal); binop_with_nan_check(nanOpt, opReduce, accuVal, currVal); }; PosUnaryOp(accuVal); // scale the accumulated value if(!float_equal_one(alpha)) accuVal *= convert_type(alpha); // scale the prior dst value and add it to the accumulated value if(!float_equal_zero(beta)) accuVal += convert_type(output.data[0]) * convert_type(beta); // store the reduced value to dst location res.data[0] = convert_type(accuVal); } else { std::vector> indexes_1, indexes_2; get_all_indexes(invariantLengths, 0, indexes_1); get_all_indexes(toReduceLengths, 0, indexes_2); // go through indexes of the invariant dimensions for(const auto& index_1 : indexes_1) { std::vector src_index; std::vector dst_index; src_index.resize(inLengths.size()); dst_index.resize(inLengths.size()); std::fill(dst_index.begin(), dst_index.end(), 0); for(int k = 0; k < invariantDims.size(); k++) dst_index[invariantDims[k]] = index_1[k]; auto dst_offset = get_offset_from_index(outStrides, dst_index); // generate the part of the index belonging to the invariant dims for(int k = 0; k < invariantDims.size(); k++) src_index[invariantDims[k]] = index_1[k]; compType accuVal = ReduceOpZeroVal(reduceOp); // go through indexes of the toReduce dimensions for(const auto& index_2 : indexes_2) { // generate the part of the index belonging to the toReduce dims for(int k = 0; k < toReduceDims.size(); k++) src_index[toReduceDims[k]] = index_2[k]; auto src_offset = get_offset_from_index(inStrides, src_index); auto currVal = convert_type(input.data[src_offset]); PreUnaryOp(currVal); binop_with_nan_check(nanOpt, opReduce, accuVal, currVal); }; PosUnaryOp(accuVal); // scale the accumulated value if(!float_equal_one(alpha)) accuVal *= convert_type(alpha); // scale the prior dst value and add it to the accumulated value if(!float_equal_zero(beta)) accuVal += convert_type(output.data[dst_offset]) * convert_type(beta); // store the reduced value to dst location res.data[dst_offset] = convert_type(accuVal); }; }; return (res); } tensor gpu() const { using reduce::convert_type; auto result = gpuImpl(); const auto dimLengths = output.desc.GetLengths(); auto result_dataFloat = make_tensor(dimLengths); for(size_t i = 0; i < result.data.size(); i++) result_dataFloat.data[i] = convert_type(result.data[i]); return (result_dataFloat); }; tensor gpuImpl() const { auto&& handle = get_handle(); auto input_dev = handle.Write(input.data); auto output_dev = handle.Write(output.data); // replicate auto res = output; std::size_t ws_sizeInBytes = workspace.desc.GetElementSize() * sizeof(T); const double alpha64 = alpha; const double beta64 = beta; const void* const alphaPtr = (std::is_same::value) ? static_cast(&alpha64) : static_cast(&alpha); const void* const betaPtr = (std::is_same::value) ? static_cast(&beta64) : static_cast(&beta); if(ws_sizeInBytes > 0) { auto workspace_dev = handle.Write(workspace.data); reduce.ReduceTensor(get_handle(), nullptr, 0, workspace_dev.get(), ws_sizeInBytes, alphaPtr, input.desc, input_dev.get(), betaPtr, output.desc, output_dev.get()); } else { reduce.ReduceTensor(get_handle(), nullptr, 0, nullptr, 0, alphaPtr, input.desc, input_dev.get(), betaPtr, output.desc, output_dev.get()); }; res.data = handle.Read(output_dev, res.data.size()); return (res); } void fail(int) const { std::cout << "verify_reduce_no_indices failed" << std::endl; std::cout << "Input Tensor" << " " << input.desc.ToString() << std::endl; } }; template struct reduce_driver : test_driver { int reduceOp = 0; // miopenReduceTensorOp_t reduceOp; int compTypeVal = 1; // miopenDataType_t compTypeVal; int nanOpt = 0; // miopenNanPropagation_t nanOpt; int indicesOpt = 0; // miopenReduceTensorIndices_t indicesOpt; miopenIndicesType_t indicesType = MIOPEN_32BIT_INDICES; std::vector inLengths; // the lengths of the input tensor's dimensions std::vector toReduceDims; // the indexes of the dimensions to be reduced in the input tensor std::vector scales; float alpha = 1.0f; float beta = 0.0f; std::vector> get_tensor_lengths() { if(std::is_same::value) return { {4, 3, 60, 50}, }; else return { {64, 3, 280, 81}, }; } std::vector> get_toreduce_dims() { std::vector> tensor_dims = { {0}, {1}, {2}, {3}, {0, 1}, {0, 3}, {0, 2}, {2, 3}, {0, 1, 3}, {1, 2, 3}, {0, 1, 2, 3}}; return tensor_dims; } reduce_driver() { add(inLengths, "D", generate_data(get_tensor_lengths())); add(toReduceDims, "R", generate_data(get_toreduce_dims())); add(reduceOp, "ReduceOp", generate_data({0, 1, 4, 5, 6, 7})); add(compTypeVal, "CompType", generate_data({1})); add(nanOpt, "N", generate_data({0, 1})); add(indicesOpt, "I", generate_data({0, 1})); add(scales, "scales", generate_data({{1.0f, 0.0f}, {0.5f, 0.5f}})); auto&& handle = get_handle(); handle.EnableProfiling(); } void run() { using reduce::convert_type; if(std::is_same::value) compTypeVal = static_cast(miopenDouble); if(std::is_same::value) { if(reduceOp == MIOPEN_REDUCE_TENSOR_MIN || reduceOp == MIOPEN_REDUCE_TENSOR_MAX || reduceOp == MIOPEN_REDUCE_TENSOR_AMAX) compTypeVal = static_cast(miopenHalf); // let compType be same as the data type else compTypeVal = static_cast(miopenFloat); } #if WORKAROUND_GPU_NUMERIC_ERROR if(std::is_same::value) { if(inLengths == std::vector{64, 3, 280, 81} && toReduceDims == std::vector{0, 1, 2, 3} && (reduceOp == 3 || reduceOp == 4) && indicesOpt == 1) { std::cout << "Workaround: Skipping the test." << std::endl; return; }; } #endif miopen::ReduceTensorDescriptor reduceDesc( static_cast(reduceOp), static_cast(compTypeVal), static_cast(nanOpt), static_cast(indicesOpt), indicesType); alpha = scales[0]; beta = scales[1]; // The test is ignored if (alpha, beta) is not (1.0f, 0.0f) and reduceOp is not Add/MUL/AVG if(reduceOp != MIOPEN_REDUCE_TENSOR_ADD && reduceOp != MIOPEN_REDUCE_TENSOR_MUL && reduceOp != MIOPEN_REDUCE_TENSOR_AVG && alpha != 1.0f && beta != 0.0f) return; // The test is ignored if indices are requested but the reduceOp is neither MIN nor MAX if(indicesOpt != MIOPEN_REDUCE_TENSOR_NO_INDICES && reduceOp != MIOPEN_REDUCE_TENSOR_MIN && reduceOp != MIOPEN_REDUCE_TENSOR_MAX && reduceOp != MIOPEN_REDUCE_TENSOR_AMAX) return; auto outLengths = this->inLengths; assert(toReduceDims.size() <= outLengths.size()); for(int i = 0; i < toReduceDims.size(); i++) assert(toReduceDims[i] < inLengths.size()); // set the lengths of the dimensions to be reduced to 1 to represent the output Tensor for(const int& toReduceDim : toReduceDims) outLengths[toReduceDim] = static_cast(1); unsigned long max_value; if(reduceOp == MIOPEN_REDUCE_TENSOR_MUL) max_value = miopen_type{} == miopenHalf ? 41 : miopen_type{} == miopenInt8 ? 127 : 111; else if(reduceOp == MIOPEN_REDUCE_TENSOR_NORM1 || reduceOp == MIOPEN_REDUCE_TENSOR_NORM2) max_value = 3; else max_value = miopen_type{} == miopenHalf ? 13 : miopen_type{} == miopenInt8 ? 127 : 999; // default data gneration (used by MIN/MAX) auto gen_value = [&](auto... is) { return (tensor_elem_gen_integer{max_value}(is...) * tensor_elem_gen_checkboard_sign{}(is...)); }; // data generation used by ADD/AVG, data is distributed around 1.0 rather than 0.0, very low // probability to get a reduced result of zero-value auto gen_value_1 = [&](auto... is) { auto rand_value = tensor_elem_gen_integer{max_value}(is...); auto sign_value = tensor_elem_gen_checkboard_sign{}(is...); return (sign_value * rand_value / max_value + 0.01); }; // Special data generation for MUL, to avoid all-zero and large accumulative error in the // reduced result auto gen_value_2 = [&](auto... is) { auto rand_value = tensor_elem_gen_integer{max_value}(is...); auto sign_value = tensor_elem_gen_checkboard_sign{}(is...); return sign_value > 0.0 ? (rand_value + max_value) / (rand_value + max_value + 1) : (rand_value + max_value + 1) / (rand_value + max_value); }; // Special data generation for NORM1 and NORM2 using a space of limitless number of values. // This method is slower due to the use of GET_RAND(), it is usually used for manual testing auto gen_value_3 = [&](auto... is) { auto rand_upper = tensor_elem_gen_integer{max_value}(is...); auto sign_value = tensor_elem_gen_checkboard_sign{}(is...); double rand_ratio = static_cast(GET_RAND() / (static_cast(RAND_MAX))); return rand_upper * sign_value * rand_ratio; }; // Special data generation for AMAX, no zero value used auto gen_value_4 = [&](auto... is) { auto rand_value = tensor_elem_gen_integer{max_value}(is...); auto sign_value = tensor_elem_gen_checkboard_sign{}(is...); return sign_value > 0.0 ? (rand_value + 0.5) : (-1.0 * rand_value - 0.5); }; // default tolerance (refer to driver.hpp) this->tolerance = 80; if(reduceOp == MIOPEN_REDUCE_TENSOR_ADD || reduceOp == MIOPEN_REDUCE_TENSOR_AVG) this->tolerance = 80 * 10; if(reduceOp == MIOPEN_REDUCE_TENSOR_MUL) this->tolerance = 80 * 300; else if(reduceOp == MIOPEN_REDUCE_TENSOR_NORM1 || reduceOp == MIOPEN_REDUCE_TENSOR_NORM2) { if(toReduceDims.size() == 4) this->tolerance = 80 * 100; else this->tolerance = 80 * 10; }; if(std::is_same::value) this->tolerance *= this->tolerance * 10.0; tensor inputTensor; switch(reduceOp) { case MIOPEN_REDUCE_TENSOR_ADD: case MIOPEN_REDUCE_TENSOR_AVG: inputTensor = tensor{this->inLengths}.generate(gen_value_1); break; case MIOPEN_REDUCE_TENSOR_MUL: inputTensor = tensor{this->inLengths}.generate(gen_value_2); break; case MIOPEN_REDUCE_TENSOR_NORM1: case MIOPEN_REDUCE_TENSOR_NORM2: inputTensor = tensor{this->inLengths}.generate(gen_value_3); break; case MIOPEN_REDUCE_TENSOR_AMAX: inputTensor = tensor{this->inLengths}.generate(gen_value_4); break; default: inputTensor = tensor{this->inLengths}.generate(gen_value); }; auto outputTensor = tensor{outLengths}; std::fill(outputTensor.begin(), outputTensor.end(), convert_type(0.0f)); auto indices_nelem = reduceDesc.GetIndicesSize(inputTensor.desc, outputTensor.desc) / sizeof(int); auto ws_sizeInBytes = reduceDesc.GetWorkspaceSize(get_handle(), inputTensor.desc, outputTensor.desc); auto workspace_nelem = (indices_nelem == 0) ? ws_sizeInBytes / sizeof(T) : (ws_sizeInBytes + sizeof(T) - 1) / sizeof(T); std::vector wsLengths = {static_cast(workspace_nelem), 1}; auto workspaceTensor = tensor{wsLengths}; std::fill(workspaceTensor.begin(), workspaceTensor.end(), convert_type(0.0f)); if(indices_nelem > 0) { std::vector indicesLengths = {static_cast(indices_nelem), 1}; auto indicesTensor = tensor{indicesLengths}; std::fill(indicesTensor.begin(), indicesTensor.end(), 1); verify(verify_reduce_with_indices( reduceDesc, inputTensor, outputTensor, workspaceTensor, indicesTensor, 1.0f, 0.0f)); verify_equals(verify_reduce_with_indices( reduceDesc, inputTensor, outputTensor, workspaceTensor, indicesTensor, 1.0f, 0.0f)); } else { verify(verify_reduce_no_indices( reduceDesc, inputTensor, outputTensor, workspaceTensor, alpha, beta)); }; }; }; int main(int argc, const char* argv[]) { std::vector as(argv + 1, argv + argc); bool test_half = false; bool test_double = false; test_half = std::any_of( as.begin(), as.end(), [](const std::string& elem) { return (elem == "--half"); }); test_double = std::any_of( as.begin(), as.end(), [](const std::string& elem) { return (elem == "--double"); }); if(test_half) test_drive>(argc, argv); else if(test_double) test_drive>(argc, argv); else test_drive>(argc, argv); };