// MIT License // // Copyright (c) 2017-2021 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. #include "common_test_header.hpp" #include "hipcub/warp/warp_load.hpp" template< class T, unsigned WarpSize, ::hipcub::WarpLoadAlgorithm Algorithm > struct Params { using type = T; static constexpr unsigned warp_size = WarpSize; static constexpr ::hipcub::WarpLoadAlgorithm algorithm = Algorithm; }; template class HipcubWarpLoadTest : public ::testing::Test { public: using params = Params; }; using HipcubWarpLoadTestParams = ::testing::Types< Params, Params, Params, Params, Params, Params, Params, Params, Params, Params, Params, Params, Params, Params, Params, Params >; template< class T, unsigned BlockSize, unsigned ItemsPerThread, unsigned LogicalWarpSize, ::hipcub::WarpLoadAlgorithm Algorithm > __global__ __launch_bounds__(BlockSize) void warp_load_kernel( T* d_input, T* d_output) { using WarpLoadT = ::hipcub::WarpLoad< T, ItemsPerThread, Algorithm, ::test_utils::DeviceSelectWarpSize::value >; constexpr unsigned warps_in_block = BlockSize / LogicalWarpSize; constexpr int tile_size = ItemsPerThread * LogicalWarpSize; const unsigned warp_id = hipThreadIdx_x / LogicalWarpSize; __shared__ typename WarpLoadT::TempStorage temp_storage[warps_in_block]; T thread_data[ItemsPerThread]; WarpLoadT(temp_storage[warp_id]).Load(d_input + warp_id * tile_size, thread_data); for (unsigned i = 0; i < ItemsPerThread; ++i) { d_output[hipThreadIdx_x * ItemsPerThread + i] = thread_data[i]; } } template< class T, unsigned BlockSize, unsigned ItemsPerThread, unsigned LogicalWarpSize, ::hipcub::WarpLoadAlgorithm Algorithm > __global__ __launch_bounds__(BlockSize) void warp_load_guarded_kernel( T* d_input, T* d_output, int valid_items, T oob_default) { using WarpLoadT = ::hipcub::WarpLoad< T, ItemsPerThread, Algorithm, ::test_utils::DeviceSelectWarpSize::value >; constexpr unsigned warps_in_block = BlockSize / LogicalWarpSize; constexpr int tile_size = ItemsPerThread * LogicalWarpSize; const unsigned warp_id = hipThreadIdx_x / LogicalWarpSize; __shared__ typename WarpLoadT::TempStorage temp_storage[warps_in_block]; T thread_data[ItemsPerThread]; WarpLoadT(temp_storage[warp_id]).Load( d_input + warp_id * tile_size, thread_data, valid_items, oob_default ); for (unsigned i = 0; i < ItemsPerThread; ++i) { d_output[hipThreadIdx_x * ItemsPerThread + i] = thread_data[i]; } } template std::vector stripe_vector(const std::vector& v, const size_t warp_size, const size_t items_per_thread) { const size_t warp_items = warp_size * items_per_thread; std::vector striped(v.size()); for (size_t i = 0; i < v.size(); ++i) { const size_t warp_idx = i % warp_items; const size_t other_warp_idx = (warp_idx % items_per_thread) * warp_size + (warp_idx / items_per_thread); const size_t other_idx = other_warp_idx + warp_items * (i / warp_items); striped[i] = v[other_idx]; } return striped; } TYPED_TEST_SUITE(HipcubWarpLoadTest, HipcubWarpLoadTestParams); TYPED_TEST(HipcubWarpLoadTest, WarpLoad) { using T = typename TestFixture::params::type; constexpr unsigned warp_size = TestFixture::params::warp_size; constexpr ::hipcub::WarpLoadAlgorithm algorithm = TestFixture::params::algorithm; constexpr unsigned items_per_thread = 4; constexpr unsigned block_size = 1024; constexpr unsigned items_count = items_per_thread * block_size; SKIP_IF_UNSUPPORTED_WARP_SIZE(warp_size); std::vector input(items_count); std::iota(input.begin(), input.end(), static_cast(0)); T* d_input{}; HIP_CHECK(test_common_utils::hipMallocHelper(&d_input, items_count * sizeof(T))); HIP_CHECK(hipMemcpy(d_input, input.data(), items_count * sizeof(T), hipMemcpyHostToDevice)); T* d_output{}; HIP_CHECK(test_common_utils::hipMallocHelper(&d_output, items_count * sizeof(T))); hipLaunchKernelGGL( HIP_KERNEL_NAME( warp_load_kernel< T, block_size, items_per_thread, warp_size, algorithm > ), dim3(1), dim3(block_size), 0, 0, d_input, d_output ); HIP_CHECK(hipPeekAtLastError()); HIP_CHECK(hipDeviceSynchronize()); std::vector output(items_count); HIP_CHECK(hipMemcpy(output.data(), d_output, items_count * sizeof(T), hipMemcpyDeviceToHost)); HIP_CHECK(hipFree(d_input)); HIP_CHECK(hipFree(d_output)); auto expected = input; if (algorithm == ::hipcub::WarpLoadAlgorithm::WARP_LOAD_STRIPED) { expected = stripe_vector(input, warp_size, items_per_thread); } ASSERT_EQ(expected, output); } TYPED_TEST(HipcubWarpLoadTest, WarpLoadGuarded) { using T = typename TestFixture::params::type; constexpr unsigned warp_size = TestFixture::params::warp_size; constexpr ::hipcub::WarpLoadAlgorithm algorithm = TestFixture::params::algorithm; constexpr unsigned items_per_thread = 4; constexpr unsigned block_size = 1024; constexpr unsigned items_count = items_per_thread * block_size; constexpr int valid_items = warp_size / 4; constexpr T oob_default = std::numeric_limits::max(); SKIP_IF_UNSUPPORTED_WARP_SIZE(warp_size); std::vector input(items_count); std::iota(input.begin(), input.end(), static_cast(0)); T* d_input{}; HIP_CHECK(test_common_utils::hipMallocHelper(&d_input, items_count * sizeof(T))); HIP_CHECK(hipMemcpy(d_input, input.data(), items_count * sizeof(T), hipMemcpyHostToDevice)); T* d_output{}; HIP_CHECK(test_common_utils::hipMallocHelper(&d_output, items_count * sizeof(T))); hipLaunchKernelGGL( HIP_KERNEL_NAME( warp_load_guarded_kernel< T, block_size, items_per_thread, warp_size, algorithm > ), dim3(1), dim3(block_size), 0, 0, d_input, d_output, valid_items, oob_default ); HIP_CHECK(hipPeekAtLastError()); HIP_CHECK(hipDeviceSynchronize()); std::vector output(items_count); HIP_CHECK(hipMemcpy(output.data(), d_output, items_count * sizeof(T), hipMemcpyDeviceToHost)); HIP_CHECK(hipFree(d_input)); HIP_CHECK(hipFree(d_output)); auto expected = input; for (size_t warp_idx = 0; warp_idx < block_size / warp_size; ++warp_idx) { auto segment_begin = std::next(expected.begin(), warp_idx * warp_size * items_per_thread); auto segment_end = std::next(expected.begin(), (warp_idx + 1) * warp_size * items_per_thread); std::fill(std::next(segment_begin, valid_items), segment_end, oob_default); } if (algorithm == ::hipcub::WarpLoadAlgorithm::WARP_LOAD_STRIPED) { expected = stripe_vector(expected, warp_size, items_per_thread); } ASSERT_EQ(expected, output); }