/* * Copyright (C) 2020 Advanced Micro Devices, Inc. All Rights Reserved. * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), * to deal in the Software without restriction, including without limitation * the rights to use, copy, modify, merge, publish, distribute, sublicense, * and/or sell copies of the Software, and to permit persons to whom the * Software is furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR * OTHER DEALINGS IN THE SOFTWARE. * */ #include "KFDSVMEvictTest.hpp" #include #include #include #include "PM4Queue.hpp" #include "PM4Packet.hpp" #include "SDMAPacket.hpp" #include "SDMAQueue.hpp" #include "Dispatch.hpp" #define N_PROCESSES (4) /* number of processes running in parallel, at least 2 */ #define ALLOCATE_BUF_SIZE_MB (64) #define ALLOCATE_RETRY_TIMES (3) HSAint32 KFDSVMEvictTest::GetBufferCounter(HSAuint64 vramSize, HSAuint64 vramBufSize) { HSAuint64 vramBufSizeInPages = vramBufSize >> PAGE_SHIFT; HSAuint64 sysMemSize = GetSysMemSize(); HSAuint64 size, sizeInPages; HSAuint32 count; LOG() << "Found System RAM of " << std::dec << (sysMemSize >> 20) << "MB" << std::endl; /* use one third of total system memory for eviction buffer to test * limit max allocate size to duoble of vramSize * count is zero if not enough memory (sysMemSize/3 + vramSize) < (vramBufSize * N_PROCESSES) */ size = sysMemSize / 3 + vramSize; size = size > vramSize << 1 ? vramSize << 1 : size; sizeInPages = size >> PAGE_SHIFT; count = sizeInPages / (vramBufSizeInPages * N_PROCESSES); return count; } void KFDSVMEvictTest::AllocBuffers(HSAuint32 defaultGPUNode, HSAuint32 count, HSAuint64 vramBufSize, std::vector &pBuffers) { HSAuint64 totalMB; totalMB = N_PROCESSES * count * (vramBufSize >> 20); if (m_IsParent) { LOG() << "Testing " << N_PROCESSES << "*" << count << "*" << (vramBufSize>>20) << "(="<< totalMB << ")MB" << std::endl; } HSAKMT_STATUS ret; HSAuint32 retry = 0; for (HSAuint32 i = 0; i < count; i++) { m_pBuf = mmap(0, vramBufSize, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); EXPECT_NOTNULL(m_pBuf); m_Flags = (HSA_SVM_FLAGS)0; retry: ret = RegisterSVMRange(defaultGPUNode, m_pBuf, vramBufSize, defaultGPUNode, m_Flags); if (ret == HSAKMT_STATUS_SUCCESS) { pBuffers.push_back(m_pBuf); retry = 0; } else { if (retry++ > ALLOCATE_RETRY_TIMES) { munmap(m_pBuf, vramBufSize); break; } printf("retry %d allocate vram\n", retry); /* wait for 1 second to try allocate again */ sleep(1); goto retry; } } } void KFDSVMEvictTest::FreeBuffers(std::vector &pBuffers, HSAuint64 vramBufSize) { for (HSAuint32 i = 0; i < pBuffers.size(); i++) { m_pBuf = pBuffers[i]; if (m_pBuf != NULL) munmap(m_pBuf, vramBufSize); } } void KFDSVMEvictTest::ForkChildProcesses(int nprocesses) { int i; for (i = 0; i < nprocesses - 1; ++i) { pid_t pid = fork(); ASSERT_GE(pid, 0); if (pid == 0) { /* Child process */ /* Cleanup file descriptors copied from parent process * then call SetUp->hsaKmtOpenKFD to create new process */ m_psName = "Test process " + std::to_string(i) + " "; TearDown(); SetUp(); m_ChildPids.clear(); m_IsParent = false; return; } /* Parent process */ m_ChildPids.push_back(pid); } m_psName = "Test process " + std::to_string(i) + " "; } void KFDSVMEvictTest::WaitChildProcesses() { if (m_IsParent) { /* only run by parent process */ int childStatus; int childExitOkNum = 0; int size = m_ChildPids.size(); for (HSAuint32 i = 0; i < size; i++) { pid_t pid = m_ChildPids.front(); waitpid(pid, &childStatus, 0); if (WIFEXITED(childStatus) == 1 && WEXITSTATUS(childStatus) == 0) childExitOkNum++; m_ChildPids.erase(m_ChildPids.begin()); } ASSERT_EQ(childExitOkNum, size); } /* child process or parent process finished successfullly */ m_ChildStatus = HSAKMT_STATUS_SUCCESS; } /* Evict and restore procedure basic test * * Use N_PROCESSES processes to allocate vram buf size larger than total vram size * * ALLOCATE_BUF_SIZE_MB buf allocation size * * number of buf is equal to (vramSizeMB / (vramBufSizeMB * N_PROCESSES) ) + 8 * Total vram all processes allocated: 8GB for 4GB Fiji, and 20GB for 16GB Vega10 * * many times of eviction and restore will happen: * ttm will evict buffers of another process if not enough free vram * process restore will evict buffers of another process * * Sometimes the allocate may fail (maybe that is normal) * ALLOCATE_RETRY_TIMES max retry times to allocate * * This is basic test, no queue so vram are not used by GPU during test * * Todo: * - Synchronization between the processes, so they know for sure when * they are done allocating memory */ TEST_F(KFDSVMEvictTest, BasicTest) { TEST_REQUIRE_ENV_CAPABILITIES(ENVCAPS_64BITLINUX); TEST_START(TESTPROFILE_RUNALL); if (!SVMAPISupported()) return; HSAuint32 defaultGPUNode = m_NodeInfo.HsaDefaultGPUNode(); ASSERT_GE(defaultGPUNode, 0) << "failed to get default GPU Node"; HSAuint64 vramBufSize = ALLOCATE_BUF_SIZE_MB * 1024 * 1024; HSAuint64 vramSize = GetVramSize(defaultGPUNode); if (!vramSize) { LOG() << "No VRAM found, skipping the test" << std::endl; return; } else { LOG() << "Found VRAM of " << std::dec << (vramSize >> 20) << "MB" << std::endl; } HSAuint32 count = GetBufferCounter(vramSize, vramBufSize); if (count == 0) { LOG() << "Not enough system memory, skipping the test" << std::endl; return; } /* Fork the child processes */ ForkChildProcesses(N_PROCESSES); std::vector pBuffers; AllocBuffers(defaultGPUNode, count, vramBufSize, pBuffers); /* wait for other processes to finish allocation, then free buffer */ sleep(ALLOCATE_RETRY_TIMES); LOG() << m_psName << "free buffer" << std::endl; FreeBuffers(pBuffers, vramBufSize); WaitChildProcesses(); TEST_END } /* Shader to read local buffers using multiple wavefronts in parallel * until address buffer is filled with specific value 0x5678 by host program, * then each wavefront fills value 0x5678 at corresponding result buffer and quit * * initial state: * s[0:1] - address buffer base address * s[2:3] - result buffer base address * s4 - workgroup id * v0 - workitem id, always 0 because NUM_THREADS_X(number of threads) in workgroup set to 1 * registers: * v0 - calculated workitem id, v0 = v0 + s4 * NUM_THREADS_X * v[2:3] - address of corresponding local buf address offset: s[0:1] + v0 * 8 * v[4:5] - corresponding output buf address: s[2:3] + v0 * 4 * v[6:7] - local buf address used for read test */ static const char* gfx9_ReadMemory = "\ shader ReadMemory\n\ type(CS)\n\ \n\ // compute address of corresponding output buffer\n\ v_mov_b32 v0, s4 // use workgroup id as index\n\ v_lshlrev_b32 v0, 2, v0 // v0 *= 4\n\ v_add_co_u32 v4, vcc, s2, v0 // v[4:5] = s[2:3] + v0 * 4\n\ v_mov_b32 v5, s3\n\ v_add_u32 v5, vcc_lo, v5\n\ \n\ // compute input buffer offset used to store corresponding local buffer address\n\ v_lshlrev_b32 v0, 1, v0 // v0 *= 8\n\ v_add_co_u32 v2, vcc, s0, v0 // v[2:3] = s[0:1] + v0 * 8\n\ v_mov_b32 v3, s1\n\ v_add_u32 v3, vcc_lo, v3\n\ \n\ // load 64bit local buffer address stored at v[2:3] to v[6:7]\n\ flat_load_dwordx2 v[6:7], v[2:3] slc\n\ s_waitcnt vmcnt(0) & lgkmcnt(0) // wait for memory reads to finish\n\ \n\ v_mov_b32 v8, 0x5678\n\ s_movk_i32 s8, 0x5678\n\ L_REPEAT:\n\ s_load_dword s16, s[0:1], 0x0 glc\n\ s_waitcnt vmcnt(0) & lgkmcnt(0) // wait for memory reads to finish\n\ s_cmp_eq_i32 s16, s8\n\ s_cbranch_scc1 L_QUIT // if notified to quit by host\n\ // loop read 64M local buffer starting at v[6:7]\n\ // every 4k page only read once\n\ v_mov_b32 v9, 0\n\ v_mov_b32 v10, 0x1000 // 4k page\n\ v_mov_b32 v11, 0x4000000 // 64M size\n\ v_mov_b32 v12, v6\n\ v_mov_b32 v13, v7\n\ L_LOOP_READ:\n\ flat_load_dwordx2 v[14:15], v[12:13] slc\n\ v_add_u32 v9, v9, v10 \n\ v_add_co_u32 v12, vcc, v12, v10\n\ v_add_u32 v13, vcc_lo, v13\n\ v_cmp_lt_u32 vcc, v9, v11\n\ s_cbranch_vccnz L_LOOP_READ\n\ s_branch L_REPEAT\n\ L_QUIT:\n\ flat_store_dword v[4:5], v8\n\ s_waitcnt vmcnt(0) & lgkmcnt(0) // wait for memory writes to finish\n\ s_endpgm\n\ end\n\ "; static const char* gfx8_ReadMemory = "\ shader ReadMemory\n\ asic(VI)\n\ type(CS)\n\ \n\ // compute address of corresponding output buffer\n\ v_mov_b32 v0, s4 // use workgroup id as index\n\ v_lshlrev_b32 v0, 2, v0 // v0 *= 4\n\ v_add_u32 v4, vcc, s2, v0 // v[4:5] = s[2:3] + v0 * 4\n\ v_mov_b32 v5, s3\n\ v_addc_u32 v5, vcc, v5, 0, vcc\n\ \n\ // compute input buffer offset used to store corresponding local buffer address\n\ v_lshlrev_b32 v0, 1, v0 // v0 *= 8\n\ v_add_u32 v2, vcc, s0, v0 // v[2:3] = s[0:1] + v0 * 8\n\ v_mov_b32 v3, s1\n\ v_addc_u32 v3, vcc, v3, 0, vcc\n\ \n\ // load 64bit local buffer address stored at v[2:3] to v[6:7]\n\ flat_load_dwordx2 v[6:7], v[2:3] slc\n\ s_waitcnt vmcnt(0) & lgkmcnt(0) // wait for memory reads to finish\n\ \n\ v_mov_b32 v8, 0x5678\n\ s_movk_i32 s8, 0x5678\n\ L_REPEAT:\n\ s_load_dword s16, s[0:1], 0x0 glc\n\ s_waitcnt vmcnt(0) & lgkmcnt(0) // wait for memory reads to finish\n\ s_cmp_eq_i32 s16, s8\n\ s_cbranch_scc1 L_QUIT // if notified to quit by host\n\ // loop read 64M local buffer starting at v[6:7]\n\ // every 4k page only read once\n\ v_mov_b32 v9, 0\n\ v_mov_b32 v10, 0x1000 // 4k page\n\ v_mov_b32 v11, 0x4000000 // 64M size\n\ v_mov_b32 v12, v6\n\ v_mov_b32 v13, v7\n\ L_LOOP_READ:\n\ flat_load_dwordx2 v[14:15], v[12:13] slc\n\ v_add_u32 v9, vcc, v9, v10 \n\ v_add_u32 v12, vcc, v12, v10\n\ v_addc_u32 v13, vcc, v13, 0, vcc\n\ v_cmp_lt_u32 vcc, v9, v11\n\ s_cbranch_vccnz L_LOOP_READ\n\ s_branch L_REPEAT\n\ L_QUIT:\n\ flat_store_dword v[4:5], v8\n\ s_waitcnt vmcnt(0) & lgkmcnt(0) // wait for memory writes to finish\n\ s_endpgm\n\ end\n\ "; std::string KFDSVMEvictTest::CreateShader() { if (m_FamilyId >= FAMILY_AI) return gfx9_ReadMemory; else return gfx8_ReadMemory; } /* Evict and restore queue test * * N_PROCESSES processes read all local buffers in parallel while buffers are evicted and restored * If GPU vm page fault happens, then test shader will stop and failed to write specific value * at dest buffer. Test will report failed. * * Steps: * - fork N_PROCESSES processes, each process does the same below * - allocate local buffers, each buffer size is 64MB * - allocate zero initialized host access address buffer and result buffer * address buffer to pass address of local buffers to shader * result buffer to store shader output result * - submit queue to run ReadMemory shader * - shader start m_DimX wavefronts, each wavefront keep reading one local buffer * - notify shader to quit * - check result buffer with specific value to confirm all wavefronts quit normally */ TEST_F(KFDSVMEvictTest, QueueTest) { TEST_REQUIRE_ENV_CAPABILITIES(ENVCAPS_64BITLINUX); TEST_START(TESTPROFILE_RUNALL) if (!SVMAPISupported()) return; HSAuint32 defaultGPUNode = m_NodeInfo.HsaDefaultGPUNode(); ASSERT_GE(defaultGPUNode, 0) << "failed to get default GPU Node"; HSAuint64 vramBufSize = ALLOCATE_BUF_SIZE_MB * 1024 * 1024; const HsaNodeProperties *pNodeProperties = m_NodeInfo.HsaDefaultGPUNodeProperties(); /* Skip test for chip it doesn't have CWSR, which the test depends on */ if (m_FamilyId < FAMILY_VI || isTonga(pNodeProperties) || m_FamilyId >= FAMILY_NV) { LOG() << std::hex << "Test is skipped for family ID 0x" << m_FamilyId << std::endl; return; } HSAuint32 i; HSAuint64 vramSize = GetVramSize(defaultGPUNode); if (!vramSize) { LOG() << "No VRAM found, skipping the test" << std::endl; return; } else { LOG() << "Found VRAM of " << std::dec << (vramSize >> 20) << "MB." << std::endl; } HSAuint32 count = GetBufferCounter(vramSize, vramBufSize); if (count == 0) { LOG() << "Not enough system memory, skipping the test" << std::endl; return; } /* assert all buffer address can be stored within one page * because only one page host memory srcBuf is allocated */ ASSERT_LE(count, PAGE_SIZE/sizeof(unsigned int *)); /* Fork the child processes */ ForkChildProcesses(N_PROCESSES); HsaMemoryBuffer isaBuffer(PAGE_SIZE, defaultGPUNode, true/*zero*/, false/*local*/, true/*exec*/); HsaMemoryBuffer addrBuffer(PAGE_SIZE, defaultGPUNode); HsaMemoryBuffer resultBuffer(PAGE_SIZE, defaultGPUNode); std::vector pBuffers; AllocBuffers(defaultGPUNode, count, vramBufSize, pBuffers); unsigned int wavefront_num = pBuffers.size(); LOG() << m_psName << "wavefront number " << wavefront_num << std::endl; void **localBufAddr = addrBuffer.As(); unsigned int *result = resultBuffer.As(); for (i = 0; i < wavefront_num; i++) *(localBufAddr + i) = pBuffers[i]; m_pIsaGen->CompileShader(CreateShader().c_str(), "ReadMemory", isaBuffer); PM4Queue pm4Queue; ASSERT_SUCCESS(pm4Queue.Create(defaultGPUNode)); Dispatch dispatch0(isaBuffer); dispatch0.SetArgs(localBufAddr, result); dispatch0.SetDim(wavefront_num, 1, 1); /* submit the packet and start shader */ dispatch0.Submit(pm4Queue); /* doing evict/restore queue test for 5 seconds while queue is running */ sleep(5); /* LOG() << m_psName << "notify shader to quit" << std::endl; */ /* fill address buffer so shader quits */ addrBuffer.Fill(0x5678); /* wait for shader to finish or timeout if shade has vm page fault */ dispatch0.SyncWithStatus(120000); ASSERT_SUCCESS(pm4Queue.Destroy()); /* LOG() << m_psName << "free buffer" << std::endl; */ /* cleanup */ FreeBuffers(pBuffers, vramBufSize); /* check if all wavefronts finish successfully */ for (i = 0; i < wavefront_num; i++) ASSERT_EQ(0x5678, *(result + i)); WaitChildProcesses(); TEST_END }