/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /** * \file * Thread utilities for reading memory using PTX cache modifiers. */ #pragma once #include #include #include "../util_ptx.cuh" #include "../util_type.cuh" #include "../util_namespace.cuh" /// Optional outer namespace(s) THRUST_CUB_NS_PREFIX /// CUB namespace namespace cub { /** * \addtogroup UtilIo * @{ */ //----------------------------------------------------------------------------- // Tags and constants //----------------------------------------------------------------------------- /** * \brief Enumeration of cache modifiers for memory load operations. */ enum CacheLoadModifier { LOAD_DEFAULT, ///< Default (no modifier) LOAD_CA, ///< Cache at all levels LOAD_CG, ///< Cache at global level LOAD_CS, ///< Cache streaming (likely to be accessed once) LOAD_CV, ///< Cache as volatile (including cached system lines) LOAD_LDG, ///< Cache as texture LOAD_VOLATILE, ///< Volatile (any memory space) }; /** * \name Thread I/O (cache modified) * @{ */ /** * \brief Thread utility for reading memory using cub::CacheLoadModifier cache modifiers. Can be used to load any data type. * * \par Example * \code * #include // or equivalently * * // 32-bit load using cache-global modifier: * int *d_in; * int val = cub::ThreadLoad(d_in + threadIdx.x); * * // 16-bit load using default modifier * short *d_in; * short val = cub::ThreadLoad(d_in + threadIdx.x); * * // 256-bit load using cache-volatile modifier * double4 *d_in; * double4 val = cub::ThreadLoad(d_in + threadIdx.x); * * // 96-bit load using cache-streaming modifier * struct TestFoo { bool a; short b; }; * TestFoo *d_struct; * TestFoo val = cub::ThreadLoad(d_in + threadIdx.x); * \endcode * * \tparam MODIFIER [inferred] CacheLoadModifier enumeration * \tparam InputIteratorT [inferred] Input iterator type \iterator */ template < CacheLoadModifier MODIFIER, typename InputIteratorT> __device__ __forceinline__ typename std::iterator_traits::value_type ThreadLoad(InputIteratorT itr); //@} end member group #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document /// Helper structure for templated load iteration (inductive case) template struct IterateThreadLoad { template static __device__ __forceinline__ void Load(T const *ptr, T *vals) { vals[COUNT] = ThreadLoad(ptr + COUNT); IterateThreadLoad::template Load(ptr, vals); } template static __device__ __forceinline__ void Dereference(InputIteratorT itr, T *vals) { vals[COUNT] = itr[COUNT]; IterateThreadLoad::Dereference(itr, vals); } }; /// Helper structure for templated load iteration (termination case) template struct IterateThreadLoad { template static __device__ __forceinline__ void Load(T const * /*ptr*/, T * /*vals*/) {} template static __device__ __forceinline__ void Dereference(InputIteratorT /*itr*/, T * /*vals*/) {} }; /** * Define a uint4 (16B) ThreadLoad specialization for the given Cache load modifier */ #define _CUB_LOAD_16(cub_modifier, ptx_modifier) \ template<> \ __device__ __forceinline__ uint4 ThreadLoad(uint4 const *ptr) \ { \ uint4 retval; \ asm volatile ("ld."#ptx_modifier".v4.u32 {%0, %1, %2, %3}, [%4];" : \ "=r"(retval.x), \ "=r"(retval.y), \ "=r"(retval.z), \ "=r"(retval.w) : \ _CUB_ASM_PTR_(ptr)); \ return retval; \ } \ template<> \ __device__ __forceinline__ ulonglong2 ThreadLoad(ulonglong2 const *ptr) \ { \ ulonglong2 retval; \ asm volatile ("ld."#ptx_modifier".v2.u64 {%0, %1}, [%2];" : \ "=l"(retval.x), \ "=l"(retval.y) : \ _CUB_ASM_PTR_(ptr)); \ return retval; \ } /** * Define a uint2 (8B) ThreadLoad specialization for the given Cache load modifier */ #define _CUB_LOAD_8(cub_modifier, ptx_modifier) \ template<> \ __device__ __forceinline__ ushort4 ThreadLoad(ushort4 const *ptr) \ { \ ushort4 retval; \ asm volatile ("ld."#ptx_modifier".v4.u16 {%0, %1, %2, %3}, [%4];" : \ "=h"(retval.x), \ "=h"(retval.y), \ "=h"(retval.z), \ "=h"(retval.w) : \ _CUB_ASM_PTR_(ptr)); \ return retval; \ } \ template<> \ __device__ __forceinline__ uint2 ThreadLoad(uint2 const *ptr) \ { \ uint2 retval; \ asm volatile ("ld."#ptx_modifier".v2.u32 {%0, %1}, [%2];" : \ "=r"(retval.x), \ "=r"(retval.y) : \ _CUB_ASM_PTR_(ptr)); \ return retval; \ } \ template<> \ __device__ __forceinline__ unsigned long long ThreadLoad(unsigned long long const *ptr) \ { \ unsigned long long retval; \ asm volatile ("ld."#ptx_modifier".u64 %0, [%1];" : \ "=l"(retval) : \ _CUB_ASM_PTR_(ptr)); \ return retval; \ } /** * Define a uint (4B) ThreadLoad specialization for the given Cache load modifier */ #define _CUB_LOAD_4(cub_modifier, ptx_modifier) \ template<> \ __device__ __forceinline__ unsigned int ThreadLoad(unsigned int const *ptr) \ { \ unsigned int retval; \ asm volatile ("ld."#ptx_modifier".u32 %0, [%1];" : \ "=r"(retval) : \ _CUB_ASM_PTR_(ptr)); \ return retval; \ } /** * Define a unsigned short (2B) ThreadLoad specialization for the given Cache load modifier */ #define _CUB_LOAD_2(cub_modifier, ptx_modifier) \ template<> \ __device__ __forceinline__ unsigned short ThreadLoad(unsigned short const *ptr) \ { \ unsigned short retval; \ asm volatile ("ld."#ptx_modifier".u16 %0, [%1];" : \ "=h"(retval) : \ _CUB_ASM_PTR_(ptr)); \ return retval; \ } /** * Define an unsigned char (1B) ThreadLoad specialization for the given Cache load modifier */ #define _CUB_LOAD_1(cub_modifier, ptx_modifier) \ template<> \ __device__ __forceinline__ unsigned char ThreadLoad(unsigned char const *ptr) \ { \ unsigned short retval; \ asm volatile ( \ "{" \ " .reg .u8 datum;" \ " ld."#ptx_modifier".u8 datum, [%1];" \ " cvt.u16.u8 %0, datum;" \ "}" : \ "=h"(retval) : \ _CUB_ASM_PTR_(ptr)); \ return (unsigned char) retval; \ } /** * Define powers-of-two ThreadLoad specializations for the given Cache load modifier */ #define _CUB_LOAD_ALL(cub_modifier, ptx_modifier) \ _CUB_LOAD_16(cub_modifier, ptx_modifier) \ _CUB_LOAD_8(cub_modifier, ptx_modifier) \ _CUB_LOAD_4(cub_modifier, ptx_modifier) \ _CUB_LOAD_2(cub_modifier, ptx_modifier) \ _CUB_LOAD_1(cub_modifier, ptx_modifier) \ /** * Define powers-of-two ThreadLoad specializations for the various Cache load modifiers */ #if CUB_PTX_ARCH >= 200 _CUB_LOAD_ALL(LOAD_CA, ca) _CUB_LOAD_ALL(LOAD_CG, cg) _CUB_LOAD_ALL(LOAD_CS, cs) _CUB_LOAD_ALL(LOAD_CV, cv) #else _CUB_LOAD_ALL(LOAD_CA, global) // Use volatile to ensure coherent reads when this PTX is JIT'd to run on newer architectures with L1 _CUB_LOAD_ALL(LOAD_CG, volatile.global) _CUB_LOAD_ALL(LOAD_CS, global) _CUB_LOAD_ALL(LOAD_CV, volatile.global) #endif #if CUB_PTX_ARCH >= 350 _CUB_LOAD_ALL(LOAD_LDG, global.nc) #else _CUB_LOAD_ALL(LOAD_LDG, global) #endif // Macro cleanup #undef _CUB_LOAD_ALL #undef _CUB_LOAD_1 #undef _CUB_LOAD_2 #undef _CUB_LOAD_4 #undef _CUB_LOAD_8 #undef _CUB_LOAD_16 /** * ThreadLoad definition for LOAD_DEFAULT modifier on iterator types */ template __device__ __forceinline__ typename std::iterator_traits::value_type ThreadLoad( InputIteratorT itr, Int2Type /*modifier*/, Int2Type /*is_pointer*/) { return *itr; } /** * ThreadLoad definition for LOAD_DEFAULT modifier on pointer types */ template __device__ __forceinline__ T ThreadLoad( T *ptr, Int2Type /*modifier*/, Int2Type /*is_pointer*/) { return *ptr; } /** * ThreadLoad definition for LOAD_VOLATILE modifier on primitive pointer types */ template __device__ __forceinline__ T ThreadLoadVolatilePointer( T *ptr, Int2Type /*is_primitive*/) { T retval = *reinterpret_cast(ptr); return retval; } /** * ThreadLoad definition for LOAD_VOLATILE modifier on non-primitive pointer types */ template __device__ __forceinline__ T ThreadLoadVolatilePointer( T *ptr, Int2Type /*is_primitive*/) { typedef typename UnitWord::VolatileWord VolatileWord; // Word type for memcopying const int VOLATILE_MULTIPLE = sizeof(T) / sizeof(VolatileWord); /* VolatileWord words[VOLATILE_MULTIPLE]; IterateThreadLoad<0, VOLATILE_MULTIPLE>::Dereference( reinterpret_cast(ptr), words); return *reinterpret_cast(words); */ T retval; VolatileWord *words = reinterpret_cast(&retval); IterateThreadLoad<0, VOLATILE_MULTIPLE>::Dereference( reinterpret_cast(ptr), words); return retval; } /** * ThreadLoad definition for LOAD_VOLATILE modifier on pointer types */ template __device__ __forceinline__ T ThreadLoad( T *ptr, Int2Type /*modifier*/, Int2Type /*is_pointer*/) { // Apply tags for partial-specialization return ThreadLoadVolatilePointer(ptr, Int2Type::PRIMITIVE>()); } /** * ThreadLoad definition for generic modifiers on pointer types */ template __device__ __forceinline__ T ThreadLoad( T const *ptr, Int2Type /*modifier*/, Int2Type /*is_pointer*/) { typedef typename UnitWord::DeviceWord DeviceWord; const int DEVICE_MULTIPLE = sizeof(T) / sizeof(DeviceWord); DeviceWord words[DEVICE_MULTIPLE]; IterateThreadLoad<0, DEVICE_MULTIPLE>::template Load( reinterpret_cast(const_cast(ptr)), words); return *reinterpret_cast(words); } /** * ThreadLoad definition for generic modifiers */ template < CacheLoadModifier MODIFIER, typename InputIteratorT> __device__ __forceinline__ typename std::iterator_traits::value_type ThreadLoad(InputIteratorT itr) { // Apply tags for partial-specialization return ThreadLoad( itr, Int2Type(), Int2Type::VALUE>()); } #endif // DOXYGEN_SHOULD_SKIP_THIS /** @} */ // end group UtilIo } // CUB namespace THRUST_CUB_NS_POSTFIX // Optional outer namespace(s)