// RUN: %hc %s -o %t.out && %t.out //---------------------------------------------------------------------------- // File: Reduction.cpp // // Implements several different versions of reduction algorithm in C++ AMP. // Each consequent version demonstrates optimization techniques to speed up // the execution. All of the algorithms are tailored to 4-byte wide data and // may require tuning when different element type is to be used. // The content of the GPU-side data structures is lost during the algorithm // processing. // Note: these implementations are for demo purposes only and not recommended // to use in production code. //---------------------------------------------------------------------------- #define NOMINMAX #include #include #include #include #include using namespace hc; //---------------------------------------------------------------------------- // Helper macro to tell whether the argument is a positive integer // power of two. //---------------------------------------------------------------------------- #define IS_POWER_OF_2(arg) (arg > 1 && (arg & (arg - 1)) == 0) //---------------------------------------------------------------------------- // Helper function checking the common preconditions for tiled algorithms. // If the conditions are not met, the implementations will fall back to CPU // reduction after some number of iterations using parallel_for_each. // As it may be unexpected, a warning message will be issued. // Addressing these limitations is left as an exercise for the reader. //---------------------------------------------------------------------------- inline bool check_tiled_precondition(unsigned tile_size, unsigned element_count) { while ((element_count % tile_size) == 0) { element_count /= tile_size; } return element_count < tile_size; } //---------------------------------------------------------------------------- // This is a simple sequential implementation. //---------------------------------------------------------------------------- float sequential_reduction(const std::vector& source) { return std::accumulate(source.begin(), source.end(), 0.f); } //---------------------------------------------------------------------------- // This is an implementation of the reduction algorithm which uses tiling and // the shared memory. //---------------------------------------------------------------------------- template float reduction_tiled_1(const std::vector& source) { static_assert(IS_POWER_OF_2(_tile_size), "Tile size must be a positive integer power of two!"); assert(source.size() <= UINT_MAX); unsigned element_count = static_cast(source.size()); assert(element_count != 0); // Cannot reduce an empty sequence. if (!check_tiled_precondition(_tile_size, element_count)) { std::cout << "Warning, reduction_tiled_1 is not designed for the current problem size." << std::endl; } // Using arrays as temporary memory. array arr_1(element_count, source.begin()); array arr_2((element_count / _tile_size) ? (element_count / _tile_size) : 1); // array_views may be swapped after each iteration. array_view av_src(arr_1); array_view av_dst(arr_2); av_dst.discard_data(); // Reduce using parallel_for_each as long as the sequence length // is evenly divisable to the number of threads in the tile. while ((element_count % _tile_size) == 0) { parallel_for_each(extent<1>(element_count).tile(_tile_size), [=] (tiled_index<1> tidx) [[hc]] [[hc_flat_workgroup_size(_tile_size)]] { // Use tile_static as a scratchpad memory. tile_static float tile_data[_tile_size]; unsigned local_idx = tidx.local[0]; tile_data[local_idx] = av_src[tidx.global]; tidx.barrier.wait(); // Reduce within a tile using multiple threads. for(unsigned s = 1; s < _tile_size; s *= 2) { if (local_idx % (2 * s) == 0) { tile_data[local_idx] += tile_data[local_idx + s]; } tidx.barrier.wait(); } // Store the tile result in the global memory. if (local_idx == 0) { av_dst[tidx.tile] = tile_data[0]; } }); // Update the sequence length, swap source with destination. element_count /= _tile_size; std::swap(av_src, av_dst); av_dst.discard_data(); } // Perform any remaining reduction on the CPU. std::vector result(element_count); copy(av_src.section(0, element_count), result.begin()); return std::accumulate(result.begin(), result.end(), 0.f); } //---------------------------------------------------------------------------- // This is a version with a better resource utilization, as only a group // of consequtive threads become active - minimizing the divergence within // the scheduling units. //---------------------------------------------------------------------------- template float reduction_tiled_2(const std::vector& source) { static_assert(IS_POWER_OF_2(_tile_size), "Tile size must be a positive integer power of two!"); assert(source.size() <= UINT_MAX); unsigned element_count = static_cast(source.size()); assert(element_count != 0); // Cannot reduce an empty sequence. if (!check_tiled_precondition(_tile_size, element_count)) { std::cout << "Warning, reduction_tiled_2 is not designed for the current problem size." << std::endl; } // Using arrays as temporary memory. array arr_1(element_count, source.begin()); array arr_2((element_count / _tile_size) ? (element_count / _tile_size) : 1); // array_views may be swapped after each iteration. array_view av_src(arr_1); array_view av_dst(arr_2); av_dst.discard_data(); // Reduce using parallel_for_each as long as the sequence length // is evenly divisable to the number of threads in the tile. while ((element_count % _tile_size) == 0) { parallel_for_each(extent<1>(element_count).tile(_tile_size), [=] (tiled_index<1> tidx) [[hc]] [[hc_flat_workgroup_size(_tile_size)]] { // Use tile_static as a scratchpad memory. tile_static float tile_data[_tile_size]; unsigned local_idx = tidx.local[0]; tile_data[local_idx] = av_src[tidx.global]; tidx.barrier.wait(); // Reduce within a tile using multiple threads. for(unsigned s = 1; s < _tile_size; s *= 2) { unsigned index = 2 * s * local_idx; if (index < _tile_size) { tile_data[index] += tile_data[index + s]; } tidx.barrier.wait(); } // Store the tile result in the global memory. if (local_idx == 0) { av_dst[tidx.tile] = tile_data[0]; } }); // Update the sequence length, swap source with destination. element_count /= _tile_size; std::swap(av_src, av_dst); av_dst.discard_data(); } // Perform any remaining reduction on the CPU. std::vector result(element_count); copy(av_src.section(0, element_count), result.begin()); return std::accumulate(result.begin(), result.end(), 0.f); } //---------------------------------------------------------------------------- // This is a version without bank conflicts issue. //---------------------------------------------------------------------------- template float reduction_tiled_3(const std::vector& source) { static_assert(IS_POWER_OF_2(_tile_size), "Tile size must be a positive integer power of two!"); assert(source.size() <= UINT_MAX); unsigned element_count = static_cast(source.size()); assert(element_count != 0); // Cannot reduce an empty sequence. if (!check_tiled_precondition(_tile_size, element_count)) { std::cout << "Warning, reduction_tiled_3 is not designed for the current problem size." << std::endl; } // Using arrays as temporary memory. array arr_1(element_count, source.begin()); array arr_2((element_count / _tile_size) ? (element_count / _tile_size) : 1); // array_views may be swapped after each iteration. array_view av_src(arr_1); array_view av_dst(arr_2); av_dst.discard_data(); // Reduce using parallel_for_each as long as the sequence length // is evenly divisable to the number of threads in the tile. while ((element_count % _tile_size) == 0) { parallel_for_each(extent<1>(element_count).tile(_tile_size), [=] (tiled_index<1> tidx) [[hc]] [[hc_flat_workgroup_size(_tile_size)]] { // Use tile_static as a scratchpad memory. tile_static float tile_data[_tile_size]; unsigned local_idx = tidx.local[0]; tile_data[local_idx] = av_src[tidx.global]; tidx.barrier.wait(); // Reduce within a tile using multiple threads. for(unsigned s = _tile_size / 2; s > 0; s /= 2) { if (local_idx < s) { tile_data[local_idx] += tile_data[local_idx + s]; } tidx.barrier.wait(); } // Store the tile result in the global memory. if (local_idx == 0) { av_dst[tidx.tile] = tile_data[0]; } }); // Update the sequence length, swap source with destination. element_count /= _tile_size; std::swap(av_src, av_dst); av_dst.discard_data(); } // Perform any remaining reduction on the CPU. std::vector result(element_count); copy(av_src.section(0, element_count), result.begin()); return std::accumulate(result.begin(), result.end(), 0.f); } //---------------------------------------------------------------------------- // This is a version with a reduced number of stalled threads in the first // iteration. //---------------------------------------------------------------------------- template float reduction_tiled_4(const std::vector& source) { static_assert(IS_POWER_OF_2(_tile_size), "Tile size must be a positive integer power of two!"); assert(source.size() <= UINT_MAX); unsigned element_count = static_cast(source.size()); assert(element_count != 0); // Cannot reduce an empty sequence. if (!check_tiled_precondition(_tile_size * 2, element_count)) { std::cout << "Warning, reduction_tiled_4 is not designed for the current problem size." << std::endl; } // Using arrays as temporary memory. array arr_1(element_count, source.begin()); array arr_2((element_count / _tile_size) ? (element_count / _tile_size) : 1); // array_views may be swapped after each iteration. array_view av_src(arr_1); array_view av_dst(arr_2); av_dst.discard_data(); // Reduce using parallel_for_each as long as the sequence length // is evenly divisable to twice the number of threads in the tile // (note each tile works on a problem size twice its own size). while (element_count >= _tile_size && (element_count % (_tile_size * 2)) == 0) { parallel_for_each(extent<1>(element_count / 2).tile(_tile_size), [=] (tiled_index<1> tidx) [[hc]] [[hc_flat_workgroup_size(_tile_size)]] { // Use tile_static as a scratchpad memory. tile_static float tile_data[_tile_size]; unsigned local_idx = tidx.local[0]; // Partition input data among tiles, // 2 * _tile_size because the number of threads spawned is halved. unsigned rel_idx = tidx.tile[0] * (_tile_size * 2) + local_idx; tile_data[local_idx] = av_src[rel_idx] + av_src[rel_idx + _tile_size]; tidx.barrier.wait(); // Reduce within a tile using multiple threads. for(unsigned s = _tile_size / 2; s > 0; s /= 2) { if (local_idx < s) { tile_data[local_idx] += tile_data[local_idx + s]; } tidx.barrier.wait(); } // Store the tile result in the global memory. if (local_idx == 0) { av_dst[tidx.tile] = tile_data[0]; } }); // Update the sequence length, swap source with destination. element_count /= _tile_size * 2; std::swap(av_src, av_dst); av_dst.discard_data(); } // Perform any remaining uneven reduction on the CPU. std::vector result(element_count); copy(av_src.section(0, element_count), result.begin()); return std::accumulate(result.begin(), result.end(), 0.f); } //---------------------------------------------------------------------------- // Here we take completely different approach by using algorithm cascading // by combining sequential and parallel reduction. //---------------------------------------------------------------------------- template float reduction_cascade(const std::vector& source) { static_assert(_tile_count > 0, "Tile count must be positive!"); static_assert(IS_POWER_OF_2(_tile_size), "Tile size must be a positive integer power of two!"); assert(source.size() <= UINT_MAX); unsigned element_count = static_cast(source.size()); assert(element_count != 0); // Cannot reduce an empty sequence. unsigned stride = _tile_size * _tile_count * 2; // Reduce tail elements. float tail_sum = 0.f; unsigned tail_length = element_count % stride; if(tail_length != 0) { tail_sum = std::accumulate(source.end() - tail_length, source.end(), 0.f); element_count -= tail_length; if(element_count == 0) { return tail_sum; } } // Using arrays as a temporary memory. array a(element_count, source.begin()); array a_partial_result(_tile_count); parallel_for_each(extent<1>(_tile_count * _tile_size).tile(_tile_size), [=, &a, &a_partial_result] (tiled_index<1> tidx) [[hc]] [[hc_flat_workgroup_size(_tile_size)]] { // Use tile_static as a scratchpad memory. tile_static float tile_data[_tile_size]; unsigned local_idx = tidx.local[0]; // Reduce data strides of twice the tile size into tile_static memory. unsigned input_idx = (tidx.tile[0] * 2 * _tile_size) + local_idx; tile_data[local_idx] = 0; do { tile_data[local_idx] += a[input_idx] + a[input_idx + _tile_size]; input_idx += stride; } while (input_idx < element_count); tidx.barrier.wait(); // Reduce to the tile result using multiple threads. for (unsigned stride = _tile_size / 2; stride > 0; stride /= 2) { if (local_idx < stride) { tile_data[local_idx] += tile_data[local_idx + stride]; } tidx.barrier.wait(); } // Store the tile result in the global memory. if (local_idx == 0) { a_partial_result[tidx.tile[0]] = tile_data[0]; } }); // Reduce results from all tiles on the CPU. std::vector v_partial_result(_tile_count); copy(a_partial_result, v_partial_result.begin()); return std::accumulate(v_partial_result.begin(), v_partial_result.end(), tail_sum); } //---------------------------------------------------------------------------- // Helper function comparing floating point numbers within a given relative // difference. //---------------------------------------------------------------------------- bool fp_equal(float a, float b, float max_rel_diff) { float diff = std::fabs(a - b); a = std::fabs(a); b = std::fabs(b); return diff <= std::max(a, b) * max_rel_diff; } //---------------------------------------------------------------------------- // Program entry point. //---------------------------------------------------------------------------- int main() { int ret = 0; accelerator default_device; std::wcout << "Using device : " << default_device.get_description() << std::endl; // if (default_device == accelerator(accelerator::direct3d_ref)) // std::cout << "WARNING!! Running on very slow emulator! Only use this accelerator for debugging." << std::endl; // Make sure that elements can be split into tiles so the number of // tiles in any dimension is less than 65536. Here we we have // element_count == 16777216 so the number of tiles: // tile_count = element_count / tile_size == 32768 < 65536 unsigned element_count = 16 * 1024 * 1024; std::vector source(element_count); for (unsigned i = 0; i < element_count; ++i) { // Element range is limited to avoid overflow or underflow source[i] = (i & 0xf) * 0.01f; } // The data is generated in a pattern and its sum can be computed by the following formula const float expected_result = ((element_count / 16) * ((15 * 16) / 2)) * 0.01f; std::cout << "Running kernels..." << std::endl; const unsigned tile_size = 512; typedef float (*ReductionFunction)(const std::vector&); typedef std::pair user_pair; std::vector functions; functions.push_back(user_pair(sequential_reduction, "sequential_reduction")); functions.push_back(user_pair(reduction_tiled_1, "reduction_tiled_1")); functions.push_back(user_pair(reduction_tiled_2, "reduction_tiled_2")); functions.push_back(user_pair(reduction_tiled_3, "reduction_tiled_3")); functions.push_back(user_pair(reduction_tiled_4, "reduction_tiled_4")); functions.push_back(user_pair(reduction_cascade, "reduction_cascade")); for(const auto& func : functions) { float result = func.first(source); if (fp_equal(result, expected_result, 0.05f)) { std::cout << "SUCCESS: " << func.second << "." << std::endl; } else { std::cout << "FAILED: " << func.second << " expected " << expected_result << " but found " << result << "!" << std::endl; ret = 1; } } return ret; }