/****************************************************************************** * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ #pragma once #if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include BEGIN_NS_THRUST // forward declare generic reduce // to circumvent circular dependency template T __host__ __device__ reduce(const thrust::detail::execution_policy_base &exec, InputIterator first, InputIterator last, T init, BinaryFunction binary_op); namespace cuda_cub { namespace __reduce { // XXX should GridSizeType also be able accomodate 64 bit integers typedef int GridSizeType; template struct is_true : detail::false_type {}; template<> struct is_true : detail::true_type {}; template struct PtxPolicy { enum { BLOCK_THREADS = _BLOCK_THREADS, ITEMS_PER_THREAD = _ITEMS_PER_THREAD, VECTOR_LOAD_LENGTH = _VECTOR_LOAD_LENGTH, MIN_BLOCKS = _MIN_BLOCKS, ITEMS_PER_TILE = _BLOCK_THREADS * _ITEMS_PER_THREAD }; static const cub::BlockReduceAlgorithm BLOCK_ALGORITHM = _BLOCK_ALGORITHM; static const cub::CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; static const cub::GridMappingStrategy GRID_MAPPING = _GRID_MAPPING; }; // struct PtxPolicy template struct Tuning; template struct Tuning { enum { // Relative size of T type to a 4-byte word SCALE_FACTOR_4B = (sizeof(T) + 3) / 4, // Relative size of T type to a 1-byte word SCALE_FACTOR_1B = sizeof(T), }; typedef PtxPolicy<256, CUB_MAX(1, 20 / SCALE_FACTOR_4B), 2, cub::BLOCK_REDUCE_WARP_REDUCTIONS, cub::LOAD_DEFAULT, cub::GRID_MAPPING_RAKE> type; }; // Tuning sm30 template struct Tuning : Tuning { // ReducePolicy1B (GTX Titan: 228.7 GB/s @ 192M 1B items) typedef PtxPolicy<128, CUB_MAX(1, 24 / Tuning::SCALE_FACTOR_1B), 4, cub::BLOCK_REDUCE_WARP_REDUCTIONS, cub::LOAD_LDG, cub::GRID_MAPPING_DYNAMIC> ReducePolicy1B; // ReducePolicy4B types (GTX Titan: 255.1 GB/s @ 48M 4B items) typedef PtxPolicy<256, CUB_MAX(1, 20 / Tuning::SCALE_FACTOR_4B), 4, cub::BLOCK_REDUCE_WARP_REDUCTIONS, cub::LOAD_LDG, cub::GRID_MAPPING_DYNAMIC> ReducePolicy4B; typedef typename detail::conditional<(sizeof(T) < 4), ReducePolicy1B, ReducePolicy4B>::type type; }; // Tuning sm35 template struct ReduceAgent { template struct PtxPlan : Tuning::type { // we need this type definition to indicate "specialize_plan" metafunction // that this PtxPlan may have specializations for different Arch // via Tuning type. // typedef Tuning tuning; typedef typename cub::CubVector Vector; typedef typename core::LoadIterator::type LoadIt; typedef cub::BlockReduce BlockReduce; typedef cub::CacheModifiedInputIterator VectorLoadIt; struct TempStorage { typename BlockReduce::TempStorage reduce; // Size dequeue_offset; }; // struct TempStorage }; // struct PtxPlan // Reduction need additional information which is not covered in // default core::AgentPlan. We thus inherit from core::AgentPlan // and add additional member fields that are needed. // Other algorithms, e.g. merge, may not need additional information, // and may use AgentPlan directly, instead of defining their own Plan type. // struct Plan : core::AgentPlan { cub::GridMappingStrategy grid_mapping; template THRUST_RUNTIME_FUNCTION Plan(P) : core::AgentPlan(P()), grid_mapping(P::GRID_MAPPING) { } }; // this specialized PtxPlan for a device-compiled Arch // ptx_plan type *must* only be used from device code // Its use from host code will result in *undefined behaviour* // typedef typename core::specialize_plan_msvc10_war::type::type ptx_plan; typedef typename ptx_plan::TempStorage TempStorage; typedef typename ptx_plan::Vector Vector; typedef typename ptx_plan::LoadIt LoadIt; typedef typename ptx_plan::BlockReduce BlockReduce; typedef typename ptx_plan::VectorLoadIt VectorLoadIt; enum { ITEMS_PER_THREAD = ptx_plan::ITEMS_PER_THREAD, BLOCK_THREADS = ptx_plan::BLOCK_THREADS, ITEMS_PER_TILE = ptx_plan::ITEMS_PER_TILE, VECTOR_LOAD_LENGTH = ptx_plan::VECTOR_LOAD_LENGTH, ATTEMPT_VECTORIZATION = (VECTOR_LOAD_LENGTH > 1) && (ITEMS_PER_THREAD % VECTOR_LOAD_LENGTH == 0) && detail::is_pointer::value && detail::is_arithmetic< typename detail::remove_cv >::value }; struct impl { //--------------------------------------------------------------------- // Per thread data //--------------------------------------------------------------------- TempStorage &storage; InputIt input_it; LoadIt load_it; ReductionOp reduction_op; //--------------------------------------------------------------------- // Constructor //--------------------------------------------------------------------- THRUST_DEVICE_FUNCTION impl(TempStorage &storage_, InputIt input_it_, ReductionOp reduction_op_) : storage(storage_), input_it(input_it_), load_it(core::make_load_iterator(ptx_plan(), input_it)), reduction_op(reduction_op_) {} //--------------------------------------------------------------------- // Utility //--------------------------------------------------------------------- // Whether or not the input is aligned with the vector type // (specialized for types we can vectorize) // template static THRUST_DEVICE_FUNCTION bool is_aligned(Iterator d_in, detail::true_type /* can_vectorize */) { return (size_t(d_in) & (sizeof(Vector) - 1)) == 0; } // Whether or not the input is aligned with the vector type // (specialized for types we cannot vectorize) // template static THRUST_DEVICE_FUNCTION bool is_aligned(Iterator, detail::false_type /* can_vectorize */) { return false; } //--------------------------------------------------------------------- // Tile processing //--------------------------------------------------------------------- // Consume a full tile of input (non-vectorized) // template THRUST_DEVICE_FUNCTION void consume_tile(T & thread_aggregate, Size block_offset, int /*valid_items*/, detail::true_type /* is_full_tile */, detail::false_type /* can_vectorize */) { T items[ITEMS_PER_THREAD]; // Load items in striped fashion cub::LoadDirectStriped(threadIdx.x, load_it + block_offset, items); // Reduce items within each thread stripe thread_aggregate = (IS_FIRST_TILE) ? cub::internal::ThreadReduce(items, reduction_op) : cub::internal::ThreadReduce(items, reduction_op, thread_aggregate); } // Consume a full tile of input (vectorized) // template THRUST_DEVICE_FUNCTION void consume_tile(T & thread_aggregate, Size block_offset, int /*valid_items*/, detail::true_type /* is_full_tile */, detail::true_type /* can_vectorize */) { // Alias items as an array of VectorT and load it in striped fashion enum { WORDS = ITEMS_PER_THREAD / VECTOR_LOAD_LENGTH }; T items[ITEMS_PER_THREAD]; Vector *vec_items = reinterpret_cast(items); // Vector Input iterator wrapper type (for applying cache modifier) T *d_in_unqualified = const_cast(input_it) + block_offset + (threadIdx.x * VECTOR_LOAD_LENGTH); VectorLoadIt vec_load_it(reinterpret_cast(d_in_unqualified)); #pragma unroll for (int i = 0; i < WORDS; ++i) { vec_items[i] = vec_load_it[BLOCK_THREADS * i]; } // Reduce items within each thread stripe thread_aggregate = (IS_FIRST_TILE) ? cub::internal::ThreadReduce(items, reduction_op) : cub::internal::ThreadReduce(items, reduction_op, thread_aggregate); } // Consume a partial tile of input // template THRUST_DEVICE_FUNCTION void consume_tile(T & thread_aggregate, Size block_offset, int valid_items, detail::false_type /* is_full_tile */, CAN_VECTORIZE) { // Partial tile int thread_offset = threadIdx.x; // Read first item if ((IS_FIRST_TILE) && (thread_offset < valid_items)) { thread_aggregate = load_it[block_offset + thread_offset]; thread_offset += BLOCK_THREADS; } // Continue reading items (block-striped) while (thread_offset < valid_items) { thread_aggregate = reduction_op( thread_aggregate, thrust::raw_reference_cast(load_it[block_offset + thread_offset])); thread_offset += BLOCK_THREADS; } } //--------------------------------------------------------------- // Consume a contiguous segment of tiles //--------------------------------------------------------------------- // Reduce a contiguous segment of input tiles // template THRUST_DEVICE_FUNCTION T consume_range_impl(Size block_offset, Size block_end, CAN_VECTORIZE can_vectorize) { T thread_aggregate; if (block_offset + ITEMS_PER_TILE > block_end) { // First tile isn't full (not all threads have valid items) int valid_items = block_end - block_offset; consume_tile(thread_aggregate, block_offset, valid_items, detail::false_type(), can_vectorize); return BlockReduce(storage.reduce) .Reduce(thread_aggregate, reduction_op, valid_items); } // At least one full block consume_tile(thread_aggregate, block_offset, ITEMS_PER_TILE, detail::true_type(), can_vectorize); block_offset += ITEMS_PER_TILE; // Consume subsequent full tiles of input while (block_offset + ITEMS_PER_TILE <= block_end) { consume_tile(thread_aggregate, block_offset, ITEMS_PER_TILE, detail::true_type(), can_vectorize); block_offset += ITEMS_PER_TILE; } // Consume a partially-full tile if (block_offset < block_end) { int valid_items = block_end - block_offset; consume_tile(thread_aggregate, block_offset, valid_items, detail::false_type(), can_vectorize); } // Compute block-wide reduction (all threads have valid items) return BlockReduce(storage.reduce) .Reduce(thread_aggregate, reduction_op); } // Reduce a contiguous segment of input tiles // THRUST_DEVICE_FUNCTION T consume_range(Size block_offset, Size block_end) { typedef is_true attempt_vec; typedef is_true path_a; typedef is_true path_b; return is_aligned(input_it + block_offset, attempt_vec()) ? consume_range_impl(block_offset, block_end, path_a()) : consume_range_impl(block_offset, block_end, path_b()); } // Reduce a contiguous segment of input tiles // THRUST_DEVICE_FUNCTION T consume_tiles(Size /*num_items*/, cub::GridEvenShare &even_share, cub::GridQueue & /*queue*/, detail::integral_constant /*is_rake*/) { typedef is_true attempt_vec; typedef is_true path_a; typedef is_true path_b; // Initialize even-share descriptor for this thread block even_share .template BlockInit(); return is_aligned(input_it, attempt_vec()) ? consume_range_impl(even_share.block_offset, even_share.block_end, path_a()) : consume_range_impl(even_share.block_offset, even_share.block_end, path_b()); } //--------------------------------------------------------------------- // Dynamically consume tiles //--------------------------------------------------------------------- // Dequeue and reduce tiles of items as part of a inter-block reduction // template THRUST_DEVICE_FUNCTION T consume_tiles_impl(Size num_items, cub::GridQueue queue, CAN_VECTORIZE can_vectorize) { using core::sync_threadblock; // We give each thread block at least one tile of input. T thread_aggregate; Size block_offset = blockIdx.x * ITEMS_PER_TILE; Size even_share_base = gridDim.x * ITEMS_PER_TILE; if (block_offset + ITEMS_PER_TILE > num_items) { // First tile isn't full (not all threads have valid items) int valid_items = num_items - block_offset; consume_tile(thread_aggregate, block_offset, valid_items, detail::false_type(), can_vectorize); return BlockReduce(storage.reduce) .Reduce(thread_aggregate, reduction_op, valid_items); } // Consume first full tile of input consume_tile(thread_aggregate, block_offset, ITEMS_PER_TILE, detail::true_type(), can_vectorize); if (num_items > even_share_base) { // Dequeue a tile of items if (threadIdx.x == 0) storage.dequeue_offset = queue.Drain(ITEMS_PER_TILE) + even_share_base; sync_threadblock(); // Grab tile offset and check if we're done with full tiles block_offset = storage.dequeue_offset; // Consume more full tiles while (block_offset + ITEMS_PER_TILE <= num_items) { consume_tile(thread_aggregate, block_offset, ITEMS_PER_TILE, detail::true_type(), can_vectorize); sync_threadblock(); // Dequeue a tile of items if (threadIdx.x == 0) storage.dequeue_offset = queue.Drain(ITEMS_PER_TILE) + even_share_base; sync_threadblock(); // Grab tile offset and check if we're done with full tiles block_offset = storage.dequeue_offset; } // Consume partial tile if (block_offset < num_items) { int valid_items = num_items - block_offset; consume_tile(thread_aggregate, block_offset, valid_items, detail::false_type(), can_vectorize); } } // Compute block-wide reduction (all threads have valid items) return BlockReduce(storage.reduce) .Reduce(thread_aggregate, reduction_op); } // Dequeue and reduce tiles of items as part of a inter-block reduction // THRUST_DEVICE_FUNCTION T consume_tiles( Size num_items, cub::GridEvenShare &/*even_share*/, cub::GridQueue & queue, detail::integral_constant) { typedef is_true attempt_vec; typedef is_true path_a; typedef is_true path_b; return is_aligned(input_it, attempt_vec()) ? consume_tiles_impl(num_items, queue, path_a()) : consume_tiles_impl(num_items, queue, path_b()); } }; // struct impl //--------------------------------------------------------------------- // Agent entry points //--------------------------------------------------------------------- // single tile reduce entry point // THRUST_AGENT_ENTRY(InputIt input_it, OutputIt output_it, Size num_items, ReductionOp reduction_op, char * shmem) { TempStorage& storage = *reinterpret_cast(shmem); if (num_items == 0) { return; } T block_aggregate = impl(storage, input_it, reduction_op).consume_range((Size)0, num_items); if (threadIdx.x == 0) *output_it = block_aggregate; } // single tile reduce entry point // THRUST_AGENT_ENTRY(InputIt input_it, OutputIt output_it, Size num_items, ReductionOp reduction_op, T init, char * shmem) { TempStorage& storage = *reinterpret_cast(shmem); if (num_items == 0) { if (threadIdx.x == 0) *output_it = init; return; } T block_aggregate = impl(storage, input_it, reduction_op).consume_range((Size)0, num_items); if (threadIdx.x == 0) *output_it = reduction_op(init, block_aggregate); } THRUST_AGENT_ENTRY(InputIt input_it, OutputIt output_it, Size num_items, cub::GridEvenShare even_share, cub::GridQueue queue, ReductionOp reduction_op, char * shmem) { TempStorage& storage = *reinterpret_cast(shmem); typedef detail::integral_constant grid_mapping; T block_aggregate = impl(storage, input_it, reduction_op) .consume_tiles(num_items, even_share, queue, grid_mapping()); if (threadIdx.x == 0) output_it[blockIdx.x] = block_aggregate; } }; // struct ReduceAgent template struct DrainAgent { template struct PtxPlan : PtxPolicy<1> {}; typedef core::specialize_plan ptx_plan; //--------------------------------------------------------------------- // Agent entry point //--------------------------------------------------------------------- THRUST_AGENT_ENTRY(cub::GridQueue grid_queue, Size num_items, char * /*shmem*/) { grid_queue.FillAndResetDrain(num_items); } }; // struct DrainAgent; template cudaError_t THRUST_RUNTIME_FUNCTION doit_step(void * d_temp_storage, size_t & temp_storage_bytes, InputIt input_it, Size num_items, T init, ReductionOp reduction_op, OutputIt output_it, cudaStream_t stream, bool debug_sync) { using core::AgentPlan; using core::AgentLauncher; using core::get_agent_plan; using core::cuda_optional; if (num_items == 0) return cudaErrorNotSupported; typedef AgentLauncher< ReduceAgent > reduce_agent; typename reduce_agent::Plan reduce_plan = reduce_agent::get_plan(stream); cudaError_t status = cudaSuccess; if (num_items <= reduce_plan.items_per_tile) { size_t vshmem_size = core::vshmem_size(reduce_plan.shared_memory_size, 1); // small, single tile size if (d_temp_storage == NULL) { temp_storage_bytes = max(1, vshmem_size); return status; } char *vshmem_ptr = vshmem_size > 0 ? (char*)d_temp_storage : NULL; reduce_agent ra(reduce_plan, num_items, stream, vshmem_ptr, "reduce_agent: single_tile only", debug_sync); ra.launch(input_it, output_it, num_items, reduction_op, init); CUDA_CUB_RET_IF_FAIL(cudaPeekAtLastError()); } else { // regular size cuda_optional sm_count = core::get_sm_count(); CUDA_CUB_RET_IF_FAIL(sm_count.status()); // reduction will not use more cta counts than requested cuda_optional max_blocks_per_sm = reduce_agent:: template get_max_blocks_per_sm, cub::GridQueue, ReductionOp>(reduce_plan); CUDA_CUB_RET_IF_FAIL(max_blocks_per_sm.status()); int reduce_device_occupancy = (int)max_blocks_per_sm * sm_count; int sm_oversubscription = 5; int max_blocks = reduce_device_occupancy * sm_oversubscription; cub::GridEvenShare even_share; even_share.DispatchInit(static_cast(num_items), max_blocks, reduce_plan.items_per_tile); // we will launch at most "max_blocks" blocks in a grid // so preallocate virtual shared memory storage for this if required // size_t vshmem_size = core::vshmem_size(reduce_plan.shared_memory_size, max_blocks); // Temporary storage allocation requirements void * allocations[3] = {NULL, NULL, NULL}; size_t allocation_sizes[3] = { max_blocks * sizeof(T), // bytes needed for privatized block reductions cub::GridQueue::AllocationSize(), // bytes needed for grid queue descriptor0 vshmem_size // size of virtualized shared memory storage }; status = cub::AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes); CUDA_CUB_RET_IF_FAIL(status); if (d_temp_storage == NULL) { return status; } T *d_block_reductions = (T*) allocations[0]; cub::GridQueue queue(allocations[1]); char *vshmem_ptr = vshmem_size > 0 ? (char *)allocations[2] : NULL; // Get grid size for device_reduce_sweep_kernel int reduce_grid_size = 0; if (reduce_plan.grid_mapping == cub::GRID_MAPPING_RAKE) { // Work is distributed evenly reduce_grid_size = even_share.grid_size; } else if (reduce_plan.grid_mapping == cub::GRID_MAPPING_DYNAMIC) { // Work is distributed dynamically size_t num_tiles = (num_items + reduce_plan.items_per_tile - 1) / reduce_plan.items_per_tile; // if not enough to fill the device with threadblocks // then fill the device with threadblocks reduce_grid_size = static_cast(min(num_tiles, static_cast(reduce_device_occupancy))); typedef AgentLauncher > drain_agent; AgentPlan drain_plan = drain_agent::get_plan(); drain_plan.grid_size = 1; drain_agent da(drain_plan, stream, "__reduce::drain_agent", debug_sync); da.launch(queue, num_items); CUDA_CUB_RET_IF_FAIL(cudaPeekAtLastError()); } else { CUDA_CUB_RET_IF_FAIL(cudaErrorNotSupported); } reduce_plan.grid_size = reduce_grid_size; reduce_agent ra(reduce_plan, stream, vshmem_ptr, "reduce_agent: regular size reduce", debug_sync); ra.launch(input_it, d_block_reductions, num_items, even_share, queue, reduction_op); CUDA_CUB_RET_IF_FAIL(cudaPeekAtLastError()); typedef AgentLauncher< ReduceAgent > reduce_agent_single; reduce_plan.grid_size = 1; reduce_agent_single ra1(reduce_plan, stream, vshmem_ptr, "reduce_agent: single tile reduce", debug_sync); ra1.launch(d_block_reductions, output_it, reduce_grid_size, reduction_op, init); CUDA_CUB_RET_IF_FAIL(cudaPeekAtLastError()); } return status; } // func doit_step template T THRUST_RUNTIME_FUNCTION reduce(Policy & policy, InputIt first, Size num_items, T init, BinaryOp binary_op) { if (num_items == 0) return init; char * d_temp_storage = NULL; size_t temp_storage_bytes = 0; cudaStream_t stream = cuda_cub::stream(policy); T * d_result = NULL; bool debug_sync = THRUST_DEBUG_SYNC_FLAG; cudaError_t status; status = doit_step(d_temp_storage, temp_storage_bytes, first, num_items, init, binary_op, d_result, stream, debug_sync); cuda_cub::throw_on_error(status, "reduce failed on 1st step"); size_t allocation_sizes[2] = {sizeof(T*), temp_storage_bytes}; void * allocations[2] = {NULL, NULL}; size_t storage_size = 0; status = core::alias_storage(NULL, storage_size, allocations, allocation_sizes); void *ptr = cuda_cub::get_memory_buffer(policy, storage_size); cuda_cub::throw_on_error(cudaGetLastError(), "reduce failed to get memory buffer"); status = core::alias_storage(ptr, storage_size, allocations, allocation_sizes); d_result = (T *)allocations[0]; d_temp_storage = (char *)allocations[1]; status = doit_step(d_temp_storage, temp_storage_bytes, first, num_items, init, binary_op, d_result, stream, debug_sync); cuda_cub::throw_on_error(status, "reduce failed on 2nd step"); status = cuda_cub::synchronize(policy); cuda_cub::throw_on_error(status, "reduce failed to synchronize"); T result = cuda_cub::get_value(policy, d_result); cuda_cub::return_memory_buffer(policy, ptr); cuda_cub::throw_on_error(cudaGetLastError(), "reduce failed to return memory buffer"); return result; } } // namespace __reduce //------------------------- // Thrust API entry points //------------------------- __thrust_exec_check_disable__ template T __host__ __device__ reduce_n(execution_policy &policy, InputIt first, Size num_items, T init, BinaryOp binary_op) { cudaStream_t stream = cuda_cub::stream(policy); if (__THRUST_HAS_CUDART__) { // Determine temporary device storage requirements. T* ret_ptr = NULL; size_t tmp_size = 0; cuda_cub::throw_on_error( cub::DeviceReduce::Reduce(NULL, tmp_size, first, ret_ptr, num_items, binary_op, init, stream, THRUST_DEBUG_SYNC_FLAG), "after reduction step 1"); // Allocate temporary storage. detail::temporary_array tmp(policy, sizeof(T) + tmp_size); // Run reduction. // `tmp.begin()` yields a `normal_iterator`, which dereferences to a // `reference`, which has an `operator&` that returns a `pointer`, which // has a `.get` method that returns a raw pointer, which we can (finally) // `static_cast` to `void*`. // // The array was dynamically allocated, so we assume that it's suitably // aligned for any type of data. `malloc`/`cudaMalloc`/`new`/`std::allocator` // make this guarantee. ret_ptr = detail::aligned_reinterpret_cast((&*tmp.begin()).get()); void* tmp_ptr = static_cast((&*(tmp.begin() + sizeof(T))).get()); cuda_cub::throw_on_error( cub::DeviceReduce::Reduce(tmp_ptr, tmp_size, first, ret_ptr, num_items, binary_op, init, stream, THRUST_DEBUG_SYNC_FLAG), "after reduction step 2"); // Synchronize the stream and get the value. cuda_cub::throw_on_error(cuda_cub::synchronize(policy), "reduce failed to synchronize"); // `tmp.begin()` yields a `normal_iterator`, which dereferences to a // `reference`, which has an `operator&` that returns a `pointer`, which // has a `.get` method that returns a raw pointer, which we can (finally) // `static_cast` to `void*`. // // The array was dynamically allocated, so we assume that it's suitably // aligned for any type of data. `malloc`/`cudaMalloc`/`new`/`std::allocator` // make this guarantee. return cuda_cub::get_value(policy, detail::aligned_reinterpret_cast((&*tmp.begin()).get())); } #if !__THRUST_HAS_CUDART__ return thrust::reduce( cvt_to_seq(derived_cast(policy)), first, first + num_items, init, binary_op); #endif } template T __host__ __device__ reduce(execution_policy &policy, InputIt first, InputIt last, T init, BinaryOp binary_op) { typedef typename iterator_traits::difference_type size_type; // FIXME: Check for RA iterator. size_type num_items = static_cast(thrust::distance(first, last)); return cuda_cub::reduce_n(policy, first, num_items, init, binary_op); } template T __host__ __device__ reduce(execution_policy &policy, InputIt first, InputIt last, T init) { return cuda_cub::reduce(policy, first, last, init, plus()); } template typename iterator_traits::value_type __host__ __device__ reduce(execution_policy &policy, InputIt first, InputIt last) { typedef typename iterator_traits::value_type value_type; return cuda_cub::reduce(policy, first, last, value_type(0)); } } // namespace cuda_cub END_NS_THRUST #include #include #endif