// MIT License // // Copyright (c) 2017-2021 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. 
#include "common_test_header.hpp"

#include "hipcub/warp/warp_store.hpp"

/// Compile-time description of a single typed-test instantiation.
///
/// \tparam T         element type that is loaded and stored by the kernels
/// \tparam WarpSize  logical warp size used to partition the thread block
/// \tparam Algorithm hipcub::WarpStore algorithm under test
template<
    class T,
    unsigned WarpSize,
    ::hipcub::WarpStoreAlgorithm Algorithm
>
struct Params
{
    using type = T;
    static constexpr unsigned warp_size = WarpSize;
    static constexpr ::hipcub::WarpStoreAlgorithm algorithm = Algorithm;
};

/// Typed-test fixture; it only re-exports the parameter pack as `params`.
template<class Params>
class HipcubWarpStoreTest : public ::testing::Test
{
public:
    using params = Params;
};

// NOTE(review): the template arguments of these 16 entries were not present in
// the reviewed text. They are reconstructed here as
// {int, double} x {32, 64} x {DIRECT, STRIPED, VECTORIZE, TRANSPOSE};
// only WARP_STORE_STRIPED is referenced by name elsewhere in this file.
// Confirm against the intended test matrix before merging.
using HipcubWarpStoreTestParams = ::testing::Types<
    Params<int, 32U, ::hipcub::WarpStoreAlgorithm::WARP_STORE_DIRECT>,
    Params<int, 32U, ::hipcub::WarpStoreAlgorithm::WARP_STORE_STRIPED>,
    Params<int, 32U, ::hipcub::WarpStoreAlgorithm::WARP_STORE_VECTORIZE>,
    Params<int, 32U, ::hipcub::WarpStoreAlgorithm::WARP_STORE_TRANSPOSE>,
    Params<int, 64U, ::hipcub::WarpStoreAlgorithm::WARP_STORE_DIRECT>,
    Params<int, 64U, ::hipcub::WarpStoreAlgorithm::WARP_STORE_STRIPED>,
    Params<int, 64U, ::hipcub::WarpStoreAlgorithm::WARP_STORE_VECTORIZE>,
    Params<int, 64U, ::hipcub::WarpStoreAlgorithm::WARP_STORE_TRANSPOSE>,
    Params<double, 32U, ::hipcub::WarpStoreAlgorithm::WARP_STORE_DIRECT>,
    Params<double, 32U, ::hipcub::WarpStoreAlgorithm::WARP_STORE_STRIPED>,
    Params<double, 32U, ::hipcub::WarpStoreAlgorithm::WARP_STORE_VECTORIZE>,
    Params<double, 32U, ::hipcub::WarpStoreAlgorithm::WARP_STORE_TRANSPOSE>,
    Params<double, 64U, ::hipcub::WarpStoreAlgorithm::WARP_STORE_DIRECT>,
    Params<double, 64U, ::hipcub::WarpStoreAlgorithm::WARP_STORE_STRIPED>,
    Params<double, 64U, ::hipcub::WarpStoreAlgorithm::WARP_STORE_VECTORIZE>,
    Params<double, 64U, ::hipcub::WarpStoreAlgorithm::WARP_STORE_TRANSPOSE>
>;

/// Stores a full tile per logical warp through hipcub::WarpStore.
///
/// Each thread first copies ItemsPerThread consecutive elements of `d_input`
/// (starting at `hipThreadIdx_x * ItemsPerThread`) into a local array, then
/// its logical warp writes them to `d_output + warp_id * tile_size`.
///
/// \param d_input  source buffer, indexed up to BlockSize * ItemsPerThread - 1
/// \param d_output destination buffer; one tile of
///                 ItemsPerThread * LogicalWarpSize elements per logical warp
template<
    class T,
    unsigned BlockSize,
    unsigned ItemsPerThread,
    unsigned LogicalWarpSize,
    ::hipcub::WarpStoreAlgorithm Algorithm
>
__global__
__launch_bounds__(BlockSize)
void warp_store_kernel(T* d_input, T* d_output)
{
    T thread_data[ItemsPerThread];
    for (unsigned i = 0; i < ItemsPerThread; ++i)
    {
        thread_data[i] = d_input[hipThreadIdx_x * ItemsPerThread + i];
    }

    using WarpStoreT = ::hipcub::WarpStore<
        T,
        ItemsPerThread,
        Algorithm,
        ::test_utils::DeviceSelectWarpSize<LogicalWarpSize>::value
    >;

    constexpr unsigned warps_in_block = BlockSize / LogicalWarpSize;
    constexpr int tile_size = ItemsPerThread * LogicalWarpSize;
    const unsigned warp_id = hipThreadIdx_x / LogicalWarpSize;

    // One TempStorage slot per logical warp in the block.
    __shared__ typename WarpStoreT::TempStorage temp_storage[warps_in_block];

    WarpStoreT(temp_storage[warp_id]).Store(d_output + warp_id * tile_size, thread_data);
}

/// Same as warp_store_kernel, but uses the Store overload that additionally
/// takes `valid_items`.
///
/// \param d_input     source buffer, indexed up to BlockSize * ItemsPerThread - 1
/// \param d_output    destination buffer; each logical warp targets its own tile
/// \param valid_items value forwarded unchanged to WarpStore::Store by every warp
template<
    class T,
    unsigned BlockSize,
    unsigned ItemsPerThread,
    unsigned LogicalWarpSize,
    ::hipcub::WarpStoreAlgorithm Algorithm
>
__global__
__launch_bounds__(BlockSize)
void warp_store_guarded_kernel(T* d_input, T* d_output, int valid_items)
{
    T thread_data[ItemsPerThread];
    for (unsigned i = 0; i < ItemsPerThread; ++i)
    {
        thread_data[i] = d_input[hipThreadIdx_x * ItemsPerThread + i];
    }

    using WarpStoreT = ::hipcub::WarpStore<
        T,
        ItemsPerThread,
        Algorithm,
        ::test_utils::DeviceSelectWarpSize<LogicalWarpSize>::value
    >;

    constexpr unsigned warps_in_block = BlockSize / LogicalWarpSize;
    constexpr int tile_size = ItemsPerThread * LogicalWarpSize;
    const unsigned warp_id = hipThreadIdx_x / LogicalWarpSize;

    // One TempStorage slot per logical warp in the block.
    __shared__ typename WarpStoreT::TempStorage temp_storage[warps_in_block];

    WarpStoreT(temp_storage[warp_id]).Store(
        d_output + warp_id * tile_size,
        thread_data,
        valid_items
    );
}

/// Host-side reference for the striped arrangement.
///
/// The input is processed in tiles of `warp_size * items_per_thread` elements.
/// Inside a tile, output position `item * warp_size + lane` receives the input
/// element at `lane * items_per_thread + item`.
///
/// `v.size()` is expected to be a multiple of the tile size; for a trailing
/// partial tile the computed source index may lie past the end of `v`.
///
/// \param v                values in per-thread consecutive order
/// \param warp_size        number of threads per tile
/// \param items_per_thread number of consecutive elements owned by each thread
/// \return a vector of the same size as `v` holding the rearranged elements
template<class T>
std::vector<T> stripe_vector(const std::vector<T>& v,
                             const size_t warp_size,
                             const size_t items_per_thread)
{
    const size_t period = warp_size * items_per_thread;
    std::vector<T> striped(v.size());
    for (size_t i = 0; i < v.size(); ++i)
    {
        // Position inside the current tile.
        const size_t i_base = i % period;
        // (items_per_thread * i_base) % period == lane * items_per_thread,
        // i_base / warp_size == item.
        const size_t other_idx_base
            = ((items_per_thread * i_base) % period) + i_base / warp_size;
        // Shift back into the tile that `i` belongs to.
        const size_t other_idx = other_idx_base + period * (i / period);
        striped[i] = v[other_idx];
    }
    return striped;
}

TYPED_TEST_SUITE(HipcubWarpStoreTest, HipcubWarpStoreTestParams);

// Full-tile store: every logical warp writes its whole tile, so the output must
// equal the input (rearranged by stripe_vector for the striped algorithm).
TYPED_TEST(HipcubWarpStoreTest, WarpStore)
{
    using T = typename TestFixture::params::type;
    constexpr unsigned warp_size = TestFixture::params::warp_size;
    constexpr ::hipcub::WarpStoreAlgorithm algorithm = TestFixture::params::algorithm;
    constexpr unsigned items_per_thread = 4;
    constexpr unsigned block_size = 1024;
    constexpr unsigned items_count = items_per_thread * block_size;

    SKIP_IF_UNSUPPORTED_WARP_SIZE(warp_size);

    // Input is 0, 1, 2, ... so every element identifies its source position.
    std::vector<T> input(items_count);
    std::iota(input.begin(), input.end(), static_cast<T>(0));

    T* d_input{};
    HIP_CHECK(test_common_utils::hipMallocHelper(&d_input, items_count * sizeof(T)));
    HIP_CHECK(hipMemcpy(d_input, input.data(), items_count * sizeof(T), hipMemcpyHostToDevice));
    T* d_output{};
    HIP_CHECK(test_common_utils::hipMallocHelper(&d_output, items_count * sizeof(T)));

    hipLaunchKernelGGL(
        HIP_KERNEL_NAME(
            warp_store_kernel<
                T,
                block_size,
                items_per_thread,
                warp_size,
                algorithm
            >
        ),
        dim3(1), dim3(block_size), 0, 0,
        d_input, d_output
    );
    HIP_CHECK(hipPeekAtLastError());
    HIP_CHECK(hipDeviceSynchronize());

    std::vector<T> output(items_count);
    HIP_CHECK(hipMemcpy(output.data(), d_output, items_count * sizeof(T), hipMemcpyDeviceToHost));

    // Release device memory before asserting: a failing ASSERT_* returns early.
    HIP_CHECK(hipFree(d_input));
    HIP_CHECK(hipFree(d_output));

    auto expected = input;
    if (algorithm == ::hipcub::WarpStoreAlgorithm::WARP_STORE_STRIPED)
    {
        expected = stripe_vector(input, warp_size, items_per_thread);
    }

    ASSERT_EQ(expected, output);
}

// Guarded store: the output buffer is zero-filled up front and the expectation
// keeps only the first `valid_items` elements of every warp tile, with the
// remainder of each tile expected to still be zero.
TYPED_TEST(HipcubWarpStoreTest, WarpStoreGuarded)
{
    using T = typename TestFixture::params::type;
    constexpr unsigned warp_size = TestFixture::params::warp_size;
    constexpr ::hipcub::WarpStoreAlgorithm algorithm = TestFixture::params::algorithm;
    constexpr unsigned items_per_thread = 4;
    constexpr unsigned block_size = 1024;
    constexpr unsigned items_count = items_per_thread * block_size;
    constexpr int valid_items = warp_size / 4;

    SKIP_IF_UNSUPPORTED_WARP_SIZE(warp_size);

    // Input is 0, 1, 2, ... so every element identifies its source position.
    std::vector<T> input(items_count);
    std::iota(input.begin(), input.end(), static_cast<T>(0));

    T* d_input{};
    HIP_CHECK(test_common_utils::hipMallocHelper(&d_input, items_count * sizeof(T)));
    HIP_CHECK(hipMemcpy(d_input, input.data(), items_count * sizeof(T), hipMemcpyHostToDevice));
    T* d_output{};
    HIP_CHECK(test_common_utils::hipMallocHelper(&d_output, items_count * sizeof(T)));
    // Zero the destination so positions that are not stored compare equal to 0.
    HIP_CHECK(hipMemset(d_output, 0, items_count * sizeof(T)));

    hipLaunchKernelGGL(
        HIP_KERNEL_NAME(
            warp_store_guarded_kernel<
                T,
                block_size,
                items_per_thread,
                warp_size,
                algorithm
            >
        ),
        dim3(1), dim3(block_size), 0, 0,
        d_input, d_output, valid_items
    );
    HIP_CHECK(hipPeekAtLastError());
    HIP_CHECK(hipDeviceSynchronize());

    std::vector<T> output(items_count);
    HIP_CHECK(hipMemcpy(output.data(), d_output, items_count * sizeof(T), hipMemcpyDeviceToHost));

    // Release device memory before asserting: a failing ASSERT_* returns early.
    HIP_CHECK(hipFree(d_input));
    HIP_CHECK(hipFree(d_output));

    auto expected = input;
    if (algorithm == ::hipcub::WarpStoreAlgorithm::WARP_STORE_STRIPED)
    {
        expected = stripe_vector(expected, warp_size, items_per_thread);
    }

    // Blank everything after the first `valid_items` elements of each warp tile.
    for (size_t warp_idx = 0; warp_idx < block_size / warp_size; ++warp_idx)
    {
        auto segment_begin = std::next(expected.begin(), warp_idx * warp_size * items_per_thread);
        auto segment_end = std::next(expected.begin(), (warp_idx + 1) * warp_size * items_per_thread);
        std::fill(std::next(segment_begin, valid_items), segment_end, static_cast<T>(0));
    }

    ASSERT_EQ(expected, output);
}