/************************************************************************* * Copyright (c) 2018-2021, NVIDIA CORPORATION. All rights reserved. * Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ #include "core.h" #include "graph.h" #include "topo.h" #include "comm.h" #include "net.h" #include "channel.h" // Pre-compute GPU->NIC, GPU->GPU and NIC->GPU paths struct ncclTopoNodeList { struct ncclTopoNode* list[NCCL_TOPO_MAX_NODES]; int count; }; static ncclResult_t getPath(struct ncclTopoSystem* system, struct ncclTopoNode* node, int t, int64_t id, struct ncclTopoLinkList** path) { for (int i=0; inodes[t].count; i++) { if (system->nodes[t].nodes[i].id == id) { *path = node->paths[t]+i; return ncclSuccess; } } WARN("Could not find node of type %d id %lx", t, id); return ncclInternalError; } NCCL_PARAM(NvbDisable, "NVB_DISABLE", 0); static ncclResult_t ncclTopoSetPaths(struct ncclTopoNode* baseNode, struct ncclTopoSystem* system) { if (baseNode->paths[baseNode->type] == NULL) { NCCLCHECK(ncclCalloc(baseNode->paths+baseNode->type, system->nodes[baseNode->type].count)); } // breadth-first search to set all paths to that node in the system struct ncclTopoNodeList nodeList; struct ncclTopoNodeList nextNodeList; nodeList.count = 1; nodeList.list[0] = baseNode; nextNodeList.count = 0; struct ncclTopoLinkList* basePath; NCCLCHECK(getPath(system, baseNode, baseNode->type, baseNode->id, &basePath)); basePath->count = 0; basePath->width = LOC_WIDTH; basePath->type = PATH_LOC; while (nodeList.count) { nextNodeList.count = 0; for (int n=0; ntype, baseNode->id, &path)); for (int l=0; lnlinks; l++) { struct ncclTopoLink* link = node->links+l; struct ncclTopoNode* remNode = link->remNode; if (remNode->paths[baseNode->type] == NULL) { NCCLCHECK(ncclCalloc(remNode->paths+baseNode->type, system->nodes[baseNode->type].count)); } struct ncclTopoLinkList* remPath; NCCLCHECK(getPath(system, remNode, baseNode->type, baseNode->id, &remPath)); float width = std::min(path->width, link->width); if (remPath->width < width) { // Find reverse link for (int l=0; lnlinks; l++) { if (remNode->links[l].remNode == node) { remPath->list[0] = remNode->links+l; break; } } if (remPath->list[0] == NULL) { WARN("Failed to find reverse path from remNode %d/%lx nlinks %d to node %d/%lx", remNode->type, remNode->id, remNode->nlinks, node->type, node->id); return ncclInternalError; } // Copy the rest of the path for (int i=0; icount; i++) remPath->list[i+1] = path->list[i]; remPath->count = path->count + 1; remPath->width = width; // Start with path type = link type. PATH and LINK types are supposed to match. // Don't consider LINK_NET as we only care about the NIC->GPU path. int type = link->type == LINK_NET ? LINK_LOC : link->type; // Differentiate between one and multiple PCI switches if (node->type == PCI && remNode->type == PCI) type = PATH_PXB; // Consider a path going through the CPU as PATH_PHB if (link->type == LINK_PCI && (node->type == CPU || link->remNode->type == CPU)) type = PATH_PHB; // Set 1 hop NVLink as NVB //if (node->type == GPU && path->type == PATH_NVL && type == PATH_NVL && remPath->count > 1) type = PATH_NVB; remPath->type = std::max(path->type, type); // Add to the list for the next iteration if not already in the list // Disallow GPUs as intermediate steps for now if (remNode->type != GPU) { int i; for (i=0; itype], node->id); #else sprintf(line, "%s/%lX :", topoNodeTypeStr[node->type], node->id); int offset = strlen(line); #endif for (int t=0; tpaths[t] == NULL) continue; for (int n = 0; nnodes[t].count; n++) { #ifdef ENABLE_TRACE line[0] = 0; int offset = 0; for (int i=0; ipaths[t][n].count; i++) { struct ncclTopoLink* link = node->paths[t][n].list[i]; struct ncclTopoNode* remNode = link->remNode; sprintf(line+offset, "--%s->%s/%lX", topoLinkTypeStr[link->type], topoNodeTypeStr[remNode->type], remNode->id); offset = strlen(line); } INFO(NCCL_GRAPH, "%s (%f)", line, node->paths[t][n].width); #else sprintf(line+offset, "%s/%lX (%d/%f/%s) ", topoNodeTypeStr[t], system->nodes[t].nodes[n].id, node->paths[t][n].count, node->paths[t][n].width, topoPathTypeStr[node->paths[t][n].type]); offset = strlen(line); #endif } } #ifndef ENABLE_TRACE INFO(NCCL_GRAPH, "%s", line); #endif } ncclResult_t ncclTopoPrintPaths(struct ncclTopoSystem* system) { for (int i=0; inodes[GPU].count; i++) { printNodePaths(system, system->nodes[GPU].nodes+i); } for (int i=0; inodes[NET].count; i++) { printNodePaths(system, system->nodes[NET].nodes+i); } return ncclSuccess; } static ncclResult_t getLocalCpu(struct ncclTopoSystem* system, int gpu, int* retCpu) { // Find the closest CPU to a GPU int minHops = 0; int localCpu = -1; struct ncclTopoLinkList* paths = system->nodes[GPU].nodes[gpu].paths[CPU]; for (int c=0; cnodes[CPU].count; c++) { int hops = paths[c].count; if (minHops == 0 || hops < minHops) { localCpu = c; minHops = hops; } } if (localCpu == -1) { WARN("Error : could not find CPU close to GPU %d", gpu); return ncclInternalError; } *retCpu = localCpu; return ncclSuccess; } static ncclResult_t addCpuStep(struct ncclTopoSystem* system, int c, int t1, int i1, int t2, int i2) { struct ncclTopoNode* cpuNode = system->nodes[CPU].nodes+c; struct ncclTopoNode* srcNode = system->nodes[t1].nodes+i1; int l=0; // Node 1 -> CPU for (int i=0; ipaths[CPU][c].count; i++) srcNode->paths[t2][i2].list[l++] = srcNode->paths[CPU][c].list[i]; // CPU -> Node 2 for (int i=0; ipaths[t2][i2].count; i++) srcNode->paths[t2][i2].list[l++] = cpuNode->paths[t2][i2].list[i]; // Update path characteristics srcNode->paths[t2][i2].count = l; srcNode->paths[t2][i2].type = std::max(srcNode->paths[CPU][c].type, cpuNode->paths[t2][i2].type); srcNode->paths[t2][i2].width = std::min(srcNode->paths[CPU][c].width, cpuNode->paths[t2][i2].width); return ncclSuccess; } // Remove/free paths for a given type static void ncclTopoRemovePathType(struct ncclTopoSystem* system, int nodeType) { for (int t=0; tnodes[t].count; n++) { struct ncclTopoNode* node = system->nodes[t].nodes+n; free(node->paths[nodeType]); node->paths[nodeType] = NULL; } // Remove links _from_ the given type for (int n=0; nnodes[nodeType].count; n++) { struct ncclTopoNode* node = system->nodes[nodeType].nodes+n; free(node->paths[t]); node->paths[t] = NULL; } } } static const int levelsOldToNew[] = { PATH_LOC, PATH_PIX, PATH_PXB, PATH_PHB, PATH_SYS, PATH_SYS }; ncclResult_t ncclGetLevel(int* level, const char* disableEnv, const char* levelEnv) { if (*level == -1) { int l = -1; if (disableEnv) { char* str = getenv(disableEnv); if (str) { int disable = strtol(str, NULL, 0); if (disable == 1) l = 0; } } if (l == -1) { char* str = getenv(levelEnv); if (str) { for (int i=0; i<=PATH_SYS; i++) { if (strcmp(str, topoPathTypeStr[i]) == 0) { l = i; break; } } // Old style numbering if (l == -1 && str[0] >= '0' && str[0] <= '9') { int oldLevel = strtol(str, NULL, 0); const int maxOldLevel = sizeof(levelsOldToNew)/sizeof(int) - 1; if (oldLevel > maxOldLevel) oldLevel = maxOldLevel; l = levelsOldToNew[oldLevel]; } } } if (l >= 0) INFO(NCCL_ALL, "%s set by environment to %s", levelEnv, topoPathTypeStr[l]); *level = l >= 0 ? l : -2; } return ncclSuccess; } int ncclTopoUserP2pLevel = -1; ncclResult_t ncclTopoCheckP2p(struct ncclTopoSystem* system, int64_t id1, int64_t id2, int* p2p, int *read, int* intermediateRank) { *p2p = 0; if (read) *read = 0; if (intermediateRank) *intermediateRank = -1; // Get GPUs from topology int g1, g2; NCCLCHECK(ncclTopoIdToIndex(system, GPU, id1, &g1)); struct ncclTopoNode* gpu1 = system->nodes[GPU].nodes+g1; if (ncclTopoIdToIndex(system, GPU, id2, &g2) == ncclInternalError) { // GPU not found, we can't use p2p. return ncclSuccess; } // Set intermediate GPU rank, if routing through an intermediate GPU. struct ncclTopoLinkList* path = gpu1->paths[GPU]+g2; if (path->count == 2) { struct ncclTopoNode* intermediateNode = path->list[0]->remNode; if (intermediateNode->type == GPU && intermediateRank) { *intermediateRank = intermediateNode->gpu.rank; } } // In general, use P2P whenever we can. int p2pLevel = PATH_SYS; // User override if (ncclTopoUserP2pLevel == -1) NCCLCHECK(ncclGetLevel(&ncclTopoUserP2pLevel, "NCCL_P2P_DISABLE", "NCCL_P2P_LEVEL")); if (ncclTopoUserP2pLevel != -2) { p2pLevel = ncclTopoUserP2pLevel; goto compare; } // Don't use P2P through ARM CPUs int arch, vendor, model; NCCLCHECK(ncclTopoCpuType(system, &arch, &vendor, &model)); if (arch == NCCL_TOPO_CPU_ARCH_ARM) p2pLevel = PATH_PXB; if (arch == NCCL_TOPO_CPU_ARCH_X86 && vendor == NCCL_TOPO_CPU_VENDOR_INTEL) { p2pLevel = PATH_PXB; } if (arch == NCCL_TOPO_CPU_ARCH_X86 && vendor == NCCL_TOPO_CPU_VENDOR_ZHAOXIN) { p2pLevel = PATH_PXB; } compare: // Compute the PCI distance and compare with the p2pLevel. if (path->type <= p2pLevel) *p2p = 1; if (path->type == PATH_NVL) { struct ncclTopoNode* gpu2 = system->nodes[GPU].nodes+g2; // Enable P2P Read for Ampere/NVLink only if (read && (gpu1->gpu.cudaCompCap == gpu2->gpu.cudaCompCap) && (gpu1->gpu.cudaCompCap == 80)) *read = 1; } return ncclSuccess; } NCCL_PARAM(NetGdrRead, "NET_GDR_READ", -2); int ncclTopoUserGdrLevel = -1; ncclResult_t ncclTopoCheckGdr(struct ncclTopoSystem* system, int64_t busId, int netDev, int read, int* useGdr) { *useGdr = 0; // Get GPU and NET int n, g; NCCLCHECK(ncclTopoIdToIndex(system, NET, netDev, &n)); struct ncclTopoNode* net = system->nodes[NET].nodes+n; NCCLCHECK(ncclTopoIdToIndex(system, GPU, busId, &g)); struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g; // Check that both the NIC and GPUs support it if (net->net.gdrSupport == 0) return ncclSuccess; if (gpu->gpu.gdrSupport == 0) return ncclSuccess; if (read) { // For reads (sends) only enable under certain conditions int gdrReadParam = ncclParamNetGdrRead(); if (gdrReadParam == 0) return ncclSuccess; if (gdrReadParam < 0) { int nvlink = 0; // Since we don't know whether there are other communicators, // it's better to keep things local if we have a single GPU. if (system->nodes[GPU].count == 1) nvlink = 1; for (int i=0; inodes[GPU].count; i++) { if (i == g) continue; if (gpu->paths[GPU][i].type == PATH_NVL) { nvlink = 1; break; } } if (!nvlink) return ncclSuccess; } } // Check if we are close enough that it makes sense to enable GDR int netGdrLevel = system->netGdrLevel == -2 ? PATH_PXB : system->netGdrLevel; NCCLCHECK(ncclGetLevel(&ncclTopoUserGdrLevel, NULL, "NCCL_NET_GDR_LEVEL")); if (ncclTopoUserGdrLevel != -2) netGdrLevel = ncclTopoUserGdrLevel; else { int arch, vendor, model; NCCLCHECK(ncclTopoCpuType(system, &arch, &vendor, &model)); if (arch == NCCL_TOPO_CPU_ARCH_X86 && vendor == NCCL_TOPO_CPU_VENDOR_AMD && model == NCCL_TOPO_CPU_TYPE_ROME) { int i, d1 = -1, d2 = -1; for (i = 0; i < system->nodes[CPU].count; i++) if (system->nodes[GPU].nodes[g].paths[CPU][i].count == 2) break; if (i nodes[CPU].count) d1 = system->nodes[CPU].nodes[i].id; for (i = 0; i < system->nodes[CPU].count; i++) if (system->nodes[NET].nodes[n].paths[CPU][i].count == 2) break; if (i nodes[CPU].count) d2 = system->nodes[CPU].nodes[i].id; if (d1 != -1 && d2 != -1 && d1 == d2 && (system->nodes[GPU].nodes[g].id & 0xf0000) == (system->nodes[NET].nodes[n].net.busId & 0xf0000)) { netGdrLevel = PATH_PHB; } } } int distance = gpu->paths[NET][n].type; if (distance > netGdrLevel) { INFO(NCCL_NET,"GPU Direct RDMA Disabled for GPU %lx / HCA %d (distance %d > %d)", busId, netDev, distance, netGdrLevel); return ncclSuccess; } *useGdr = 1; INFO(NCCL_NET,"GPU Direct RDMA Enabled for GPU %lx / HCA %d (distance %d <= %d), read %d", busId, netDev, distance, netGdrLevel, read); return ncclSuccess; } ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclPeerInfo* peerInfos) { // Precompute paths between GPUs/NICs. // Remove everything in case we're re-computing for (int t=0; tnodes[CPU].count; c++) { NCCLCHECK(ncclTopoSetPaths(system->nodes[CPU].nodes+c, system)); } // Set direct paths from/to GPUs. for (int g=0; gnodes[GPU].count; g++) { // Compute paths to GPU g NCCLCHECK(ncclTopoSetPaths(system->nodes[GPU].nodes+g, system)); // Update path when we don't want to / can't use GPU Direct P2P for (int p=0; pnodes[GPU].count; p++) { int p2p; NCCLCHECK(ncclTopoCheckP2p(system, system->nodes[GPU].nodes[p].id, system->nodes[GPU].nodes[g].id, &p2p, NULL, NULL)); if (p2p == 0) { // Divert all traffic through the CPU int cpu; NCCLCHECK(getLocalCpu(system, g, &cpu)); NCCLCHECK(addCpuStep(system, cpu, GPU, p, GPU, g)); } } if (peerInfos == NULL) continue; // Remove GPUs we can't talk to because of containers. struct ncclPeerInfo* dstInfo = peerInfos+system->nodes[GPU].nodes[g].gpu.rank; for (int p=0; pnodes[GPU].count; p++) { if (p == g) continue; struct ncclPeerInfo* srcInfo = peerInfos+system->nodes[GPU].nodes[p].gpu.rank; int shm; NCCLCHECK(ncclTransports[TRANSPORT_SHM].canConnect(&shm, system, NULL, srcInfo, dstInfo)); int p2p; NCCLCHECK(ncclTransports[TRANSPORT_P2P].canConnect(&p2p, system, NULL, srcInfo, dstInfo)); if (shm == 0 && p2p == 0) { // Mark this peer as inaccessible. We'll trim it later. system->nodes[GPU].nodes[p].paths[GPU][g].count = 0; } } } // Set direct paths from/to NICs. for (int n=0; nnodes[NET].count; n++) { struct ncclTopoNode* netNode = system->nodes[NET].nodes+n; NCCLCHECK(ncclTopoSetPaths(netNode, system)); for (int g=0; gnodes[GPU].count; g++) { // Update path when we dont want to / can't use GPU Direct RDMA. int gdr; NCCLCHECK(ncclTopoCheckGdr(system, system->nodes[GPU].nodes[g].id, netNode->id, 0, &gdr)); if (gdr == 0) { // We cannot use GPU Direct RDMA, divert all traffic through the CPU local to the GPU int localCpu; NCCLCHECK(getLocalCpu(system, g, &localCpu)); NCCLCHECK(addCpuStep(system, localCpu, NET, n, GPU, g)); NCCLCHECK(addCpuStep(system, localCpu, GPU, g, NET, n)); } } } return ncclSuccess; } ncclResult_t ncclTopoTrimSystem(struct ncclTopoSystem* system, struct ncclComm* comm) { int *domains; int64_t *ids; NCCLCHECK(ncclCalloc(&domains, system->nodes[GPU].count)); NCCLCHECK(ncclCalloc(&ids, system->nodes[GPU].count)); int myDomain = 0; for (int g=0; gnodes[GPU].count; g++) { struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g; domains[g] = g; ids[g] = gpu->id; for (int p=0; ppaths[GPU][p].count > 0) { domains[g] = std::min(domains[g], domains[p]); } } if (gpu->gpu.rank == comm->rank) myDomain = domains[g]; } int ngpus = system->nodes[GPU].count; for (int i=0; inodes[GPU].count /* This one varies over the loops */; g++) { gpu = system->nodes[GPU].nodes+g; if (gpu->id == ids[i]) break; else gpu=NULL; } if (gpu == NULL) { WARN("Could not find id %lx", ids[i]); free(domains); free(ids); return ncclInternalError; } NCCLCHECK(ncclTopoRemoveNode(system, GPU, g)); } // trim low speed port on same NIC for (int i = 0; i < system->nodes[NET].count; i ++) { for (int j = 0; j < system->nodes[NET].count; j ++) { if (i == j) continue; if (system->nodes[NET].nodes[i].net.asic == system->nodes[NET].nodes[j].net.asic) { if (system->nodes[NET].nodes[i].net.width > system->nodes[NET].nodes[j].net.width) system->nodes[NET].nodes[j].net.width = 0; } } } do { int n; for (n=0; nnodes[NET].count; n++) { if (system->nodes[NET].nodes[n].net.width == 0) break; } if (nnodes[NET].count) { NCCLCHECK(ncclTopoRemoveNode(system, NET, n)); } else break; } while (system->nodes[NET].count); int remove = 1; int arch, vendor, model; NCCLCHECK(ncclTopoCpuType(system, &arch, &vendor, &model)); if (arch == NCCL_TOPO_CPU_ARCH_X86 && vendor == NCCL_TOPO_CPU_VENDOR_AMD && model == NCCL_TOPO_CPU_TYPE_ROME) { int gdr, ret = 1; int64_t net; for (int g = 0; g < system->nodes[GPU].count; g++) { NCCLCHECK(ncclTopoGetLocalNet(system, system->nodes[GPU].nodes[g].gpu.rank, &net, 0)); NCCLCHECK(ncclTopoCheckGdr(system, system->nodes[GPU].nodes[g].id, net, 1, &gdr)); if (!gdr) { ret = 0; break; } } if (ret) { bool allXgmi = true; // don't trim NICs unless all GPUs are connected by XGMI for (int i = 0; i < system->nodes[GPU].count && allXgmi; i++) { int cudaDev1 = system->nodes[GPU].nodes[i].gpu.dev; for (int j = 0; j < system->nodes[GPU].count && allXgmi; j++) { if (i == j) continue; int cudaDev2 = system->nodes[GPU].nodes[j].gpu.dev; bool isXGMI; NCCLCHECK(ncclTopoGetLinkType(comm->topo, cudaDev1, cudaDev2, &isXGMI)); allXgmi &= isXGMI; } } if (!allXgmi) { remove = 0; system->type |= RCCL_TOPO_GDR_ALL; INFO(NCCL_GRAPH, "GDR is available on all GPUs"); } } } comm->localRanks = system->nodes[GPU].count; if (system->nodes[GPU].count == comm->nRanks && remove) { for (int n=system->nodes[NET].count-1; n>=0; n--) NCCLCHECK(ncclTopoRemoveNode(system, NET, n)); } free(domains); free(ids); return ncclSuccess; } void ncclTopoFree(struct ncclTopoSystem* system) { for (int t=0; tnodes[GPU].nodes[peer].paths[GPU]+g; if (path->type == PATH_NVL) { float nvlWidth = ncclTopoNVLinkSpeed(system->nodes[GPU].nodes[g].gpu.cudaCompCap); *nChannels = 2*std::max(1, (int)(path->width / nvlWidth)); } else { *nChannels = 2; } } else { // Remote rank, use network *nChannels = 1; } return ncclSuccess; } NCCL_PARAM(MinP2pNChannels, "MIN_P2P_NCHANNELS", 1); NCCL_PARAM(MaxP2pNChannels, "MAX_P2P_NCHANNELS", MAXCHANNELS); static int nextPow2(int v) { int pow2 = 1; while (pow2 < v) pow2 <<= 1; return pow2; } ncclResult_t ncclTopoComputeP2pChannels(struct ncclComm* comm) { comm->p2pnChannels = std::min(comm->nChannels, (int)ncclParamMaxP2pNChannels()); comm->p2pnChannels = std::max(comm->p2pnChannels, (int)ncclParamMinP2pNChannels()); int minChannels = comm->p2pnChannels; // We need to loop through all local GPUs to have a global picture for (int g=0; gtopo->nodes[GPU].count; g++) { for (int r=0; rnRanks; r++) { int nChannels; NCCLCHECK(ncclTopoGetNchannels(comm->topo, g, r, &nChannels)); if (nChannels >= 0) minChannels = std::min(minChannels, nChannels); } } if (comm->topo->nodes[GPU].count == comm->topo->nRanks && (comm->topo->type & RCCL_TOPO_4P2H_ROME) && !(comm->topo->type & RCCL_TOPO_GDR_ALL) && (comm->topo->nodes[GPU].nodes[0].gpu.gcn != 910)) { // Adjust P2P channels on Rome comm->p2pnChannelsPerPeer = 2; comm->p2pnChannels = 2; } else { // Round to next pow2 nChannelsPerPeer and nChannels comm->p2pnChannelsPerPeer = nextPow2(minChannels); comm->p2pnChannels = nextPow2(comm->p2pnChannels); } // Init channels that weren't used so far for (int c=comm->nChannels; cnChannels, comm->p2pnChannels); c++) NCCLCHECK(initChannel(comm, c)); // We want to spread channels used when there aren't many and progressively // fill the whole space of nChannels. To do so we mirror the bits in the // nChannels space. for (int c=0; cp2pnChannelsPerPeer; c++) { int mirror = 0; for (int b=1, mb=(comm->p2pnChannels>>1); bp2pnChannels; b<<=1, mb>>=1) if (c & b) mirror |= mb; comm->p2pChannels[c] = mirror; } INFO(NCCL_INIT, "%d coll channels, %d p2p channels, %d p2p channels per peer", comm->nChannels, comm->p2pnChannels, comm->p2pnChannelsPerPeer); return ncclSuccess; } ncclResult_t ncclTopoGetNvbGpus(struct ncclTopoSystem* system, int rank, int* nranks, int** ranks) { int ngpus = system->nodes[GPU].count; NCCLCHECK(ncclCalloc(ranks, ngpus)); int nvbGpus = 0; for (int g=0; gnodes[GPU].nodes+g; if (gpu->gpu.rank != rank) continue; for (int p=0; ppaths[GPU][p].type == PATH_NVB) { (*ranks)[nvbGpus++] = system->nodes[GPU].nodes[p].gpu.rank; } } } *nranks = nvbGpus; return ncclSuccess; }