/************************************************************************* * Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved. * Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ template class Primitives< T, RedOp, Fan, Direct, ProtoSimple > { static constexpr int MaxRecv = Fan::MaxRecv, MaxSend = Fan::MaxSend; static constexpr int Input=0, Output=1; static constexpr int RoleInput = 0x01, RoleOutput = 0x02, RoleWaitRecv = 0x04, RoleWaitSend = 0x08, RolePostSend = 0x10, RolePostRecv = 0x20, Aborted = 0x40, PtrsFifoEnabled = 0x80, SizesFifoEnabled = 0x100, DirectWrite = 0x200, DirectRead = 0x400, ThreadsSynced = 0x800; const int tid; int nthreads; int nworkers; const int stepSize; Fan fan; int index; // Peer index I'm responsible for int flags; int group; uint64_t step; union { void **connPtrsFifoPtr; // (flags & PtrsFifoEnabled) T *userBuff; // (flags & (RoleInput|RoleOutput)) T *connEltsFifo; // !(flags & (PtrsFifoEnabled|RoleInput|RoleOutput)) }; union { int volatile *connSizesFifoPtr; // (flags & SizesFifoEnabled) T *directBuff; // !(flags & SizesFifoEnabled) }; uint64_t volatile *connStepPtr; uint64_t connStepCache; // Cache last seen value of (*connStepPtr) uint64_t* barriers; uint64_t* barrier_next; const uint64_t opCount; // Don't use barrier 0 as it's used by the final sync inline __device__ void barrier() { #if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__) if (nthreads == WARP_SIZE) __syncwarp(); else barrier_by_group(); #else if (nthreads == WARP_SIZE) __syncwarp(); else asm volatile("bar.sync %0, %1;" :: "r"(group+1), "r"(nthreads)); #endif flags |= ThreadsSynced; } inline __device__ void subBarrier() { #if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__) barrier(); #else if (nworkers == nthreads) barrier(); else asm volatile("bar.sync %0, %1;" :: "r"(group+2), "r"(nworkers)); #endif } inline __device__ bool checkAbort(int &spins) { spins++; if (!(flags & Aborted) && spins == NCCL_SPINS_BEFORE_CHECK_ABORT) { flags |= atomicAdd_system((unsigned int *)ncclShmem->comm.abortFlag, 0) ? Aborted : 0; spins = 0; } return flags & Aborted; } template __device__ __forceinline__ void waitPeer(intptr_t dstIx, intptr_t remoteIx, int offset, int nelts) { const bool isSendNotRecv = (Send && Recv) ? (flags & RoleWaitSend) : Send; const bool noRecvWait = DirectRecv && Src && (flags & DirectRead); // no wait when directly reading from remote input const bool noSendWait = DirectSend && (flags & (DirectRead|DirectWrite)); // no wait in empty send (e.g. directScatter) or direct remote write #if defined(ENABLE_PROFILING) && !defined(ENABLE_TIMING_PROFILE) uint64_t t0 = __builtin_amdgcn_s_memrealtime(); #endif if (((flags & (Recv*RoleWaitRecv)) && !noRecvWait) || ((flags & (Send*RoleWaitSend)) && !noSendWait)) { int spins = 0; while (connStepCache + (isSendNotRecv ? NCCL_STEPS : 0) < step + StepPerSlice) { __builtin_amdgcn_s_sleep(8); connStepCache = LOAD(connStepPtr); if (checkAbort(spins)) break; //if (spins == 0) printf("r=%d b=%d t=%d SPUN OUT got=%d want=%d\n", ncclShmem->comm.rank, blockIdx.x, threadIdx.x, int(connStepCache + (isSendNotRecv ? NCCL_STEPS : 0)), int(step+StepPerSlice)); } __asm__ __volatile__("s_wakeup"); } if (flags & (Recv*RoleWaitRecv | Send*RoleWaitSend)) { if (isSendNotRecv && (flags & SizesFifoEnabled)) STORE(connSizesFifoPtr+step%NCCL_STEPS, nelts*sizeof(T)); void **ptrs = isSendNotRecv ? (ncclShmem->groups[group].dsts + Dst) : (ncclShmem->groups[group].srcs + Src); if (flags & PtrsFifoEnabled) loadPtr(connPtrsFifoPtr + step%NCCL_STEPS, ptrs[index]); else if (isSendNotRecv && DirectSend) { if (flags & DirectWrite) { ptrs[index] = directBuff + remoteIx + offset; } else if (flags & DirectRead) { // empty send ptrs[index] = nullptr; } else { ptrs[index] = connEltsFifo + (step%NCCL_STEPS)*stepSize; } } else if (!isSendNotRecv && DirectRecv) { if (flags & DirectRead) { ptrs[index] = directBuff + remoteIx + offset; } else if (flags & DirectWrite) { ptrs[index] = directBuff + dstIx + offset; // send to next from my output buffer } else { ptrs[index] = connEltsFifo + (step%NCCL_STEPS)*stepSize; } } else { ptrs[index] = connEltsFifo + (step%NCCL_STEPS)*stepSize; } step += StepPerSlice; #if defined(ENABLE_PROFILING) && !defined(ENABLE_TIMING_PROFILE) if (opCount) { if (isSendNotRecv) ncclShmem->comm.devProf->elems[blockIdx.x].wait_send_cycle += (__builtin_amdgcn_s_memrealtime() - t0); else ncclShmem->comm.devProf->elems[blockIdx.x].wait_recv_cycle += (__builtin_amdgcn_s_memrealtime() - t0); } #endif } } template inline __device__ void postPeer() { if (flags & (Recv*RolePostRecv | Send*RolePostSend)) { step += StepPerSlice; atomicExch_system((unsigned long long *)connStepPtr, step); } } template __device__ __forceinline__ void genericOp( intptr_t srcIx, intptr_t dstIx, intptr_t remoteIx, int nelem, bool postOp ) { constexpr int DirectRecv = 1 && Direct && DirectRecv1; constexpr int DirectSend = 1 && Direct && DirectSend1; constexpr int Src = SrcBuf != -1; constexpr int Dst = DstBuf != -1; nelem = nelem < 0 ? 0 : nelem; int sliceSize = stepSize*StepPerSlice; sliceSize = max(divUp(nelem, 16*SlicePerChunk)*16, sliceSize/32); int slice = 0; int offset = 0; if (tid < nworkers && offset < nelem) { // Worker-only loop for non-empty slices. Non-workers and empty slices are // processed in the loop following this if block. The benefit of splitting // the loop like this is we pull two branches out of the critical path. // Using "number of branch insns (taken or not) encountered dynamically" // as the performance metric, then: // perf_orig = 2*numslices // perf_new = 2+numslices // So the new code and old code behave the same for numslices=2, and for // numslices>2 the new code is superior. And note that in the case // numslices=1, the loop is trivially unrollable (single iteration) so we // don't incur that that tail branch and we still have perf_new=2. // // ORIGINAL CODE: // unrolled for(slices) { // if(worker) { // This branch removed // wait(); // subBarrier(); // if(slice not empty) // This branch removed // ReduceCopyMulti(); // } // barrier(); // post(); // } // Since we no longer unroll, new branch added here #pragma unroll 1 do { sliceSize = sliceSize < nelem-offset ? sliceSize : nelem-offset; if (Src && (flags & (SrcBuf==Input ? RoleInput : RoleOutput))) ncclShmem->groups[group].srcs[0] = userBuff + srcIx + offset; if (Dst && (flags & (DstBuf==Input ? RoleInput : RoleOutput))) ncclShmem->groups[group].dsts[0] = userBuff + dstIx + offset; #ifdef ENABLE_PROFILING uint64_t t0; if (tid == 0) t0 = __builtin_amdgcn_s_memrealtime(); #endif waitPeer(dstIx, remoteIx, offset, sliceSize); subBarrier(); #ifdef ENABLE_PROFILING if (tid == 0 && opCount) ncclShmem->comm.devProf->elems[blockIdx.x].wait_cycle += (__builtin_amdgcn_s_memrealtime() - t0); #endif if (DirectRecv && ncclShmem->groups[group].srcs[0] == ncclShmem->groups[group].dsts[0]) { // We can only have one direct receive. Since srcs[0] == dstPtr+offset, skip one copy if (Send) { // (1-Send) is only there to avoid compilation errors in case MaxSend=0 (and Send=0). ReduceOrCopyMulti (tid, nworkers, nullptr, false, 1, (T const**)ncclShmem->groups[group].srcs, fan.nsend(), (T**)ncclShmem->groups[group].dsts+1, sliceSize); } } else if (DirectSend && !DirectRecv && SrcBuf != Input && ncclShmem->groups[group].dsts[Dst] == nullptr) { // For broadcast in CollNet to do empty send ReduceOrCopyMulti (tid, nworkers, ncclShmem->redOpArgs, postOp, Recv, (T const**)ncclShmem->groups[group].srcs, Dst, (T**)ncclShmem->groups[group].dsts, sliceSize); } else { constexpr int PreOpN = SrcBuf != Input ? 0 : DirectRecv*MaxRecv == NCCL_MAX_DIRECT_ARITY ? (1+NCCL_MAX_DIRECT_ARITY) : 1; ReduceOrCopyMulti (tid, nworkers, ncclShmem->redOpArgs, postOp, Recv*fan.nrecv()+Src, (T const**)ncclShmem->groups[group].srcs, Send*fan.nsend()+Dst, (T**)ncclShmem->groups[group].dsts, sliceSize); } barrier(); // This barrier has a counterpart in following loop //if (Send && (flags & RolePostSend) && index == 0) __threadfence_system(); __syncwarp(); postPeer(); offset += sliceSize; slice += 1; } while (slice < SlicePerChunk && offset < nelem); } // Non-workers come straight here. Workers too but only once the remaining // slices are all empty. Since empty slices are the uncommon case, and // worker perf is the limiter, perf-wise this loop is effectively unentered, // hence just a single branch insn. #pragma unroll 1 while (slice < SlicePerChunk) { sliceSize = sliceSize < nelem-offset ? sliceSize : nelem-offset; { // Only workers could have Wait roles so we know the slice must be empty // since we've exited the loop above. waitPeer(0, 0, 0, 0); } barrier(); // Has couterpart in preceding worker-only loop. //if (Send && (flags & RolePostSend) && sliceSize > 0 && index == 0) __threadfence_system(); __syncwarp(); postPeer(); offset += sliceSize; slice += 1; } } // Scatter/Gather generic op template __device__ __forceinline__ void ScatterGatherOp(intptr_t inpIx, intptr_t outIx, int totalElem, int peerElem, int skip, int shift, bool postOp) { constexpr int DirectRecv = 1 && Direct && DirectRecv1; constexpr int DirectSend = 1 && Direct && DirectSend1; int offset = 0; // slice offset int sliceSize = stepSize*StepPerSlice; int dataSize = max(DIVUP(peerElem, 16*SlicePerChunk)*16, sliceSize/32); // per-peer slice size #pragma unroll 1 for (int slice=0; slicegroups[group].srcs[0] = userBuff + inpIx + offset; if (tid == 0) ncclShmem->groups[group].totalSendSize[slice] = 0; // Skip the threadfence // realSize is not accurate here; but intra-node does not rely on sizes FIFO waitPeer<0, DirectSend, 0, 1, 1, 0>(0, inpIx, offset, realSize); subBarrier(); #pragma unroll 1 for (int j=0; j= 0 && i >= skip) peerOffset += peerElem; const T* src0 = (T*)ncclShmem->groups[group].srcs[0] + peerOffset; int realPeerSize = min(realSize, totalElem-peerOffset); if (realPeerSize > 0 && ncclShmem->groups[group].dsts[i] != nullptr) { ReduceOrCopyMulti(tid, nworkers, ncclShmem->redOpArgs, false, 1, &src0, 1, (T**)ncclShmem->groups[group].dsts+i, realPeerSize); if (tid == 0) ncclShmem->groups[group].totalSendSize[slice] += realPeerSize; } } } else if (Recv) { if (flags & RoleOutput) ncclShmem->groups[group].dsts[0] = userBuff + outIx + offset; int peerOffset = index*peerElem; if (skip >= 0 && index >= skip) peerOffset += peerElem; // Adjust remote index with peer offset in case we are directly pulling from peer's output buffer waitPeer(outIx, outIx+peerOffset, offset, realSize); subBarrier(); if (DirectRecv && ncclShmem->groups[group].srcs[0] == ncclShmem->groups[group].dsts[0]) { // Since waitPeer sets srcs[0] to output buffer + offset, we are doing a direct-write based recv // Do nothing } else { #pragma unroll 1 for (int j=0; j= 0 && i >= skip) peerOffset += peerElem; T* dst0 = (T*)ncclShmem->groups[group].dsts[0] + peerOffset; int realPeerSize = min(realSize, totalElem-peerOffset); if (realPeerSize > 0) ReduceOrCopyMulti(tid, nworkers, ncclShmem->redOpArgs, postOp, 1, (const T**)ncclShmem->groups[group].srcs+i, 1, &dst0, realPeerSize); } } } } barrier(); if (Send && (flags & RolePostSend) && ncclShmem->groups[group].totalSendSize[slice] > 0 && index == 0) __threadfence_system(); __syncwarp(); postPeer(); offset += realSize; } } __device__ __forceinline__ void loadRecvConn(ncclPeer *peer, int connIndex, struct ncclWorkElem* e) { if (flags & (RoleWaitRecv|RolePostRecv)) { auto *conn = &peer->recv[connIndex].conn; step = conn->step; step = roundUp(step, SlicePerChunk*StepPerSlice); if (flags & RolePostRecv) { connStepPtr = conn->head; STORE(connStepPtr, step); // Return credits in case we rounded up. } if (flags & RoleWaitRecv) { ncclShmem->groups[group].recvConns[index] = conn; // WaitRecv role saves since that's who needs it in setDataPtrs() connStepPtr = conn->tail; connStepCache = LOAD(connStepPtr); flags |= (conn->ptrsFifo != nullptr) ? PtrsFifoEnabled : 0; if (Direct) { // User buffers have been registered if ((conn->direct & (NCCL_IPC_READ|NCCL_IPC_WRITE)) && e != nullptr && e->regUsed) { if (connIndex == 1) { flags |= DirectRead; // scatter-reduce use direct pull } else { flags |= (e->direct & NCCL_DIRECT_WRITE) ? DirectWrite : (e->direct & NCCL_DIRECT_READ) ? DirectRead : 0; } } else if (conn->direct & (NCCL_DIRECT_WRITE|NCCL_DIRECT_READ)) { if (connIndex == 1) { flags |= DirectRead; // scatter-reduce use direct pull } else { // direct read not allowed in non-register case // otherwise, in one-to-multi send, we could mix empty send and intermediate send flags |= (conn->direct & NCCL_DIRECT_WRITE) ? DirectWrite : 0; } } } if (flags & PtrsFifoEnabled) connPtrsFifoPtr = conn->ptrsFifo; else connEltsFifo = (T*)conn->buffs[NCCL_PROTO_SIMPLE]; } } } __device__ __forceinline__ void loadSendConn(ncclPeer *peer, int connIndex, struct ncclWorkElem* e) { if (flags & (RoleWaitSend|RolePostSend)) { auto *conn = &peer->send[connIndex].conn; step = conn->step; step = roundUp(step, SlicePerChunk*StepPerSlice); if (flags & RolePostSend) { connStepPtr = conn->tail; } if (flags & RoleWaitSend) { ncclShmem->groups[group].sendConns[index] = conn; // WaitSend role saves since that's who needs it in setDataPtrs() connStepPtr = conn->head; connStepCache = LOAD(connStepPtr); flags |= (conn->ptrsFifo != nullptr) ? PtrsFifoEnabled : 0; if (flags & PtrsFifoEnabled) connPtrsFifoPtr = conn->ptrsFifo; else connEltsFifo = (T*)conn->buffs[NCCL_PROTO_SIMPLE]; if (conn->sizesFifo != nullptr) { flags |= SizesFifoEnabled; connSizesFifoPtr = conn->sizesFifo; } else if (Direct) { // User buffers have been registered if ((conn->direct & (NCCL_IPC_READ|NCCL_IPC_WRITE)) && e != nullptr && e->regUsed) { if (connIndex == 1) { flags |= DirectRead; // scatter-reduce use direct pull } else { flags |= (e->direct & NCCL_DIRECT_WRITE) ? DirectWrite : (e->direct & NCCL_DIRECT_READ) ? DirectRead : 0; } } else if (conn->direct & (NCCL_DIRECT_WRITE|NCCL_DIRECT_READ)) { if (connIndex == 1) { flags |= DirectRead; // scatter-reduce use direct pull } else { // direct read not allowed in non-register case // otherwise, in one-to-multi send, we could mix empty send and intermediate send flags |= (conn->direct & NCCL_DIRECT_WRITE) ? DirectWrite : 0; } } } } } } public: __device__ Primitives( int tid, int nthreads, int const *recvPeers, int const *sendPeers, void const *inputBuf, void *outputBuf, uint64_t redOpArg, uint32_t group=0, struct ncclWorkElem* e = nullptr ): tid(tid), stepSize(ncclShmem->comm.buffSizes[NCCL_PROTO_SIMPLE]/NCCL_STEPS/sizeof(T)), opCount(ncclShmem->work.elems[0].coll.opCount) { // For send operations, we need an extra warp to overlap the threadfence and the copy this->nthreads = nthreads; this->nworkers = nthreads; this->group = group & (uint16_t)0xFFFF; int connIndex = group >> 16; barriers = &ncclShmem->groups[this->group].barrier; barrier_next = ncclShmem->groups[this->group].barrier_next; int nrecv=0, nsend=0; while (nrecv < MaxRecv && recvPeers[nrecv] != -1) nrecv++; while (nsend < MaxSend && sendPeers[nsend] != -1) nsend++; this->fan = Fan(nrecv, nsend); constexpr int ThreadPerSync = 8; static_assert(MaxSend < ThreadPerSync && MaxRecv < ThreadPerSync, "Not enough threads to cover all peers"); int g = tid / ThreadPerSync; int ng = nthreads / ThreadPerSync; index = tid % ThreadPerSync; flags = 0; if (g == 0) { if (index < nrecv) flags |= RoleWaitRecv; if (index == nrecv) flags |= RoleInput; } else if (g == 1) { if (index < nsend) flags |= RoleWaitSend; if (index == nsend) flags |= RoleOutput; } else if (g == ng - 2) { if (index < nrecv) flags |= RolePostRecv; } else if (g == ng - 1) { if (index < nsend) flags |= RolePostSend; } int peer = 0; if (flags & (RoleWaitRecv|RolePostRecv)) peer = recvPeers[index]; if (flags & (RoleWaitSend|RolePostSend)) peer = sendPeers[index]; loadRecvConn(&ncclShmem->channel.devPeers[peer], connIndex, e); loadSendConn(&ncclShmem->channel.devPeers[peer], connIndex, e); setDataPtrs(inputBuf, outputBuf, redOpArg, (struct ncclWorkRegElem*)e); } __device__ ~Primitives() { // Ensure ncclShmem->groups[].send/recvConns are available if (!(flags & ThreadsSynced)) barrier(); // Save steps for the next operation if (flags & (RolePostSend|RolePostRecv)) { auto *conns = (flags & RolePostSend) ? ncclShmem->groups[group].sendConns : ncclShmem->groups[group].recvConns; STORE(&conns[index]->step, step); } // Make sure all threads are done writing back conn->step and done using // ncclShmem->groups[group] barrier(); } __device__ void setDataPtrs(void const *inputBuf, void *outputBuf, uint64_t redOpArg, struct ncclWorkRegElem* e) { if (flags & RoleInput) { userBuff = (T*)inputBuf; ncclShmem->redOpArgs[0] = redOpArg; // scaler for local input } if (flags & RoleOutput) userBuff = (T*)outputBuf; bool recvProvider = flags == (flags|RoleWaitRecv|DirectWrite); bool sendAcceptor = flags == (flags|RoleWaitSend|DirectWrite); bool sendProvider = flags == (flags|RoleWaitSend|DirectRead); // sender provides direct buffer (to be fetched) bool recvAcceptor = flags == (flags|RoleWaitRecv|DirectRead); // receiver accepts direct buffer int regUsed = e != nullptr ? e->elem.regUsed : 0; if (Direct && recvProvider) { int spins = 0; void *volatile *slot = ncclShmem->groups[group].recvConns[index]->ptrExchange; // Wait for consumer to consume previous value before trampling it. while (LOAD(slot) != nullptr && !checkAbort(spins)); directBuff = (T*)outputBuf; // Encode pointer by XOR'ing against some address they definitely wouldn't send // since we want to allow them sending us nullptr while not colliding with // the empty slot value. *slot = reinterpret_cast(reinterpret_cast(directBuff) ^ reinterpret_cast(slot)); } if (Direct && sendAcceptor) { int spins = 0; void *volatile *slot = ncclShmem->groups[group].sendConns[index]->ptrExchange; void *ptr; while (true) { ptr = LOAD(slot); if (ptr != nullptr || checkAbort(spins)) break; } directBuff = regUsed ? (T*)(e->dnOutputs[index]) : reinterpret_cast(reinterpret_cast(ptr) ^ reinterpret_cast(slot)); *slot = nullptr; } if (Direct && sendProvider) { int spins = 0; void *volatile *slot = ncclShmem->groups[group].sendConns[index]->ptrExchange; volatile uint64_t* argSlot0 = ncclShmem->groups[group].sendConns[index]->redOpArgExchange; volatile uint64_t* argSlot1 = ncclShmem->groups[group].sendConns[index]->redOpArgExchange+1; // Wait for consumer to consume previous value before trampling it. while ((*slot != nullptr || *argSlot0 != 0 || *argSlot1 !=0) && !checkAbort(spins)); // If there is no recv, then we are directly pulling from input buffer (e.g. directScatter) // Otherwise, we are pulling from output buffer (e.g. recvCopyDirectSend) directBuff = MaxRecv == 0 ? (T*)inputBuf : (T*)outputBuf; // Exchange pre-scalers for use in direct pull *argSlot0 = (uint64_t(1)<<32) | (uint32_t)redOpArg; *argSlot1 = (uint64_t(1)<<32) | (uint32_t)(redOpArg>>32); // Encode pointer by XOR'ing against some address they definitely wouldn't send // since we want to allow them sending us nullptr while not colliding with // the empty slot value. *slot = reinterpret_cast(reinterpret_cast(directBuff) ^ reinterpret_cast(slot)); } if (Direct && recvAcceptor) { int spins = 0; void *volatile *slot = ncclShmem->groups[group].recvConns[index]->ptrExchange; volatile uint64_t* argSlot0 = ncclShmem->groups[group].recvConns[index]->redOpArgExchange; volatile uint64_t* argSlot1 = ncclShmem->groups[group].recvConns[index]->redOpArgExchange+1; void *ptr; while (true) { ptr = *slot; if (ptr != nullptr || checkAbort(spins)) break; } directBuff = regUsed ? (T*)(MaxSend == 0 ? e->upOutputs[index] : e->dnInputs[index]) : reinterpret_cast(reinterpret_cast(ptr) ^ reinterpret_cast(slot)); if (MaxSend != 0) { // reduce group rather than gather group // Store scalers for remote inputs uint64_t arg0, arg1; while (true) { arg0 = *argSlot0; arg1 = *argSlot1; if ((arg0 != 0 && arg1 != 0) || checkAbort(spins)) break; } ncclShmem->redOpArgs[1+index] = ((arg1 & 0xffffffff)<<32) | (arg0 & 0xffffffff); } *argSlot0 = 0; *argSlot1 = 0; *slot = nullptr; } } __device__ void moveDataPtrs(intptr_t delta) { if (flags & (RoleInput|RoleOutput)) userBuff += delta; } __device__ __forceinline__ void send(intptr_t inpIx, int eltN) { genericOp<0, 0, 0, 1, Input, -1>(inpIx, -1, -1, eltN, false); } __device__ __forceinline__ void sendFromOutput(intptr_t outIx, int eltN) { genericOp<0, 0, 0, 1, Output, -1>(outIx, -1, -1, eltN, false); } __device__ __forceinline__ void directSend(intptr_t inpIx, intptr_t remoteOutIx, int eltN) { genericOp<0, 1, 0, 1, Input, -1>(inpIx, -1, remoteOutIx, eltN, false); } __device__ __forceinline__ void directSendFromOutput(intptr_t outIx, intptr_t remoteOutIx, int eltN) { genericOp<0, 1, 0, 1, Output, -1>(outIx, -1, remoteOutIx, eltN, false); } __device__ __forceinline__ void recv(intptr_t outIx, int eltN, bool postOp=false) { genericOp<0, 0, 1, 0, -1, Output>(-1, outIx, -1, eltN, postOp); } __device__ __forceinline__ void directRecv(intptr_t outIx, int eltN) { genericOp<1, 0, 1, 0, -1, Output>(-1, outIx, -1, eltN, /*postOp=*/false); } __device__ __forceinline__ void copySend(intptr_t inpIx, intptr_t outIx, int eltN, bool postOp=false) { genericOp<0, 0, 0, 1, Input, Output>(inpIx, outIx, -1, eltN, postOp); } __device__ __forceinline__ void directCopySend(intptr_t inpIx, intptr_t outIx, intptr_t remoteOutIx, int eltN, bool postOp=false) { genericOp<0, 1, 0, 1, Input, Output>(inpIx, outIx, remoteOutIx, eltN, postOp); } __device__ __forceinline__ void recvCopySend(intptr_t outIx, int eltN, bool postOp=false) { genericOp<0, 0, 1, 1, -1, Output>(-1, outIx, -1, eltN, postOp); } __device__ __forceinline__ void directRecvCopySend(intptr_t outIx, intptr_t remoteOutIx, int eltN) { genericOp<1, 1, 1, 1, -1, Output>(-1, outIx, remoteOutIx, eltN, false); } __device__ __forceinline__ void recvCopyDirectSend(intptr_t outIx, intptr_t remoteOutIx, int eltN, bool postOp=false) { genericOp<0, 1, 1, 1, -1, Output>(-1, outIx, remoteOutIx, eltN, postOp); } __device__ __forceinline__ void recvReduceCopy(intptr_t inpIx, intptr_t outIx, int eltN, bool postOp=false) { genericOp<0, 0, 1, 0, Input, Output>(inpIx, outIx, -1, eltN, postOp); } __device__ __forceinline__ void recvReduceSend(intptr_t inpIx, int eltN, bool postOp=false) { genericOp<0, 0, 1, 1, Input, -1>(inpIx, -1, -1, eltN, postOp); } __device__ __forceinline__ void directRecvReduceSend(intptr_t inpIx, intptr_t remoteInpIx, int eltN, bool postOp=false) { genericOp<1, 0, 1, 1, Input, -1>(inpIx, -1, remoteInpIx, eltN, postOp); } __device__ __forceinline__ void recvReduceCopySend(intptr_t inpIx, intptr_t outIx, int eltN, bool postOp=false) { genericOp<0, 0, 1, 1, Input, Output>(inpIx, outIx, -1, eltN, postOp); } __device__ __forceinline__ void directRecvReduceCopySend(intptr_t inpIx, intptr_t outIx, intptr_t remoteOutIx, int eltN, bool postOp=false) { // Direct is only for the send part genericOp<0, 1, 1, 1, Input, Output>(inpIx, outIx, remoteOutIx, eltN, postOp); } __device__ __forceinline__ void scatter(intptr_t inpIx, int totalElem, int peerElem, int skip, int shift) { ScatterGatherOp<0, 0, 0, 1>(inpIx, -1, totalElem, peerElem, skip, shift, /*postOp=*/false); } __device__ __forceinline__ void directScatter(intptr_t inpIx, int totalElem, int peerElem, int skip, int shift) { ScatterGatherOp<0, 1, 0, 1>(inpIx, -1, totalElem, peerElem, skip, shift, /*postOp=*/false); } __device__ __forceinline__ void gather(intptr_t outIx, int totalElem, int peerElem, int skip, int shift, bool postOp=false) { ScatterGatherOp<0, 0, 1, 0>(-1, outIx, totalElem, peerElem, skip, shift, postOp); } __device__ __forceinline__ void directGather(intptr_t outIx, int totalElem, int peerElem, int skip, int shift) { ScatterGatherOp<1, 0, 1, 0>(-1, outIx, totalElem, peerElem, skip, shift, /*postOp=*/false); } __device__ __forceinline__ void recvSend(int eltN) { genericOp<0, 0, 1, 1, -1, -1>(-1, -1, -1, eltN, /*postOp=*/false); } };