//////////////////////////////////////////////////////////////////////////////// // // The University of Illinois/NCSA // Open Source License (NCSA) // // Copyright (c) 2014-2020, Advanced Micro Devices, Inc. All rights reserved. // // Developed by: // // AMD Research and AMD HSA Software Development // // Advanced Micro Devices, Inc. // // www.amd.com // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to // deal with the Software without restriction, including without limitation // the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the // Software is furnished to do so, subject to the following conditions: // // - Redistributions of source code must retain the above copyright notice, // this list of conditions and the following disclaimers. // - Redistributions in binary form must reproduce the above copyright // notice, this list of conditions and the following disclaimers in // the documentation and/or other materials provided with the distribution. // - Neither the names of Advanced Micro Devices, Inc, // nor the names of its contributors may be used to endorse or promote // products derived from this Software without specific prior written // permission. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL // THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR // OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, // ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER // DEALINGS WITH THE SOFTWARE. // //////////////////////////////////////////////////////////////////////////////// #include "core/inc/amd_memory_region.h" #include #include #include "core/inc/runtime.h" #include "core/inc/amd_cpu_agent.h" #include "core/inc/amd_gpu_agent.h" #include "core/util/utils.h" #include "core/inc/exceptions.h" namespace rocr { namespace AMD { // Tracks aggregate size of system memory available on platform size_t MemoryRegion::max_sysmem_alloc_size_ = 0; void* MemoryRegion::AllocateKfdMemory(const HsaMemFlags& flag, HSAuint32 node_id, size_t size) { void* ret = NULL; const HSAKMT_STATUS status = hsaKmtAllocMemory(node_id, size, flag, &ret); return (status == HSAKMT_STATUS_SUCCESS) ? ret : NULL; } void MemoryRegion::FreeKfdMemory(void* ptr, size_t size) { if (ptr == NULL || size == 0) { return; } HSAKMT_STATUS status = hsaKmtFreeMemory(ptr, size); assert(status == HSAKMT_STATUS_SUCCESS); } bool MemoryRegion::RegisterMemory(void* ptr, size_t size, const HsaMemFlags& MemFlags) { assert(ptr != NULL); assert(size != 0); const HSAKMT_STATUS status = hsaKmtRegisterMemoryWithFlags(ptr, size, MemFlags); return (status == HSAKMT_STATUS_SUCCESS); } void MemoryRegion::DeregisterMemory(void* ptr) { hsaKmtDeregisterMemory(ptr); } bool MemoryRegion::MakeKfdMemoryResident(size_t num_node, const uint32_t* nodes, const void* ptr, size_t size, uint64_t* alternate_va, HsaMemMapFlags map_flag) { assert(num_node > 0); assert(nodes != NULL); *alternate_va = 0; const HSAKMT_STATUS status = hsaKmtMapMemoryToGPUNodes( const_cast(ptr), size, alternate_va, map_flag, num_node, const_cast(nodes)); return (status == HSAKMT_STATUS_SUCCESS); } void MemoryRegion::MakeKfdMemoryUnresident(const void* ptr) { hsaKmtUnmapMemoryToGPU(const_cast(ptr)); } MemoryRegion::MemoryRegion(bool fine_grain, bool kernarg, bool full_profile, core::Agent* owner, const HsaMemoryProperties& mem_props) : core::MemoryRegion(fine_grain, kernarg, full_profile, owner), mem_props_(mem_props), max_single_alloc_size_(0), virtual_size_(0), fragment_allocator_(BlockAllocator(*this)) { virtual_size_ = GetPhysicalSize(); mem_flag_.Value = 0; map_flag_.Value = 0; static const HSAuint64 kGpuVmSize = (1ULL << 40); if (IsLocalMemory()) { mem_flag_.ui32.PageSize = HSA_PAGE_SIZE_4KB; mem_flag_.ui32.NoSubstitute = 1; mem_flag_.ui32.HostAccess = (mem_props_.HeapType == HSA_HEAPTYPE_FRAME_BUFFER_PRIVATE) ? 0 : 1; mem_flag_.ui32.NonPaged = 1; virtual_size_ = kGpuVmSize; } else if (IsSystem()) { mem_flag_.ui32.PageSize = HSA_PAGE_SIZE_4KB; mem_flag_.ui32.NoSubstitute = 0; mem_flag_.ui32.HostAccess = 1; mem_flag_.ui32.CachePolicy = HSA_CACHING_CACHED; if (kernarg) mem_flag_.ui32.Uncached = 1; virtual_size_ = (full_profile) ? os::GetUserModeVirtualMemorySize() : kGpuVmSize; } // Bind if memory region is coarse or fine grain mem_flag_.ui32.CoarseGrain = (fine_grain) ? 0 : 1; // Adjust allocatable size per page align max_single_alloc_size_ = AlignDown(static_cast(GetPhysicalSize()), kPageSize_); // Keep track of total system memory available // @note: System memory is surfaced as both coarse // and fine grain memory regions. To track total system // memory only fine grain is considered as it avoids // double counting if (IsSystem() && (fine_grain)) { max_sysmem_alloc_size_ += max_single_alloc_size_; } assert(GetVirtualSize() != 0); assert(GetPhysicalSize() <= GetVirtualSize()); assert(IsMultipleOf(max_single_alloc_size_, kPageSize_)); } MemoryRegion::~MemoryRegion() {} hsa_status_t MemoryRegion::Allocate(size_t& size, AllocateFlags alloc_flags, void** address) const { ScopedAcquire lock(&owner()->agent_memory_lock_); return AllocateImpl(size, alloc_flags, address); } hsa_status_t MemoryRegion::AllocateImpl(size_t& size, AllocateFlags alloc_flags, void** address) const { if (address == NULL) { return HSA_STATUS_ERROR_INVALID_ARGUMENT; } if (!IsSystem() && !IsLocalMemory()) { return HSA_STATUS_ERROR_INVALID_ALLOCATION; } // Alocation requests for system memory considers aggregate // memory available on all CPU devices if (size > ((IsSystem() ? max_sysmem_alloc_size_ : max_single_alloc_size_))) { return HSA_STATUS_ERROR_INVALID_ALLOCATION; } size = AlignUp(size, kPageSize_); HsaMemFlags kmt_alloc_flags(mem_flag_); kmt_alloc_flags.ui32.ExecuteAccess = (alloc_flags & AllocateExecutable ? 1 : 0); kmt_alloc_flags.ui32.AQLQueueMemory = (alloc_flags & AllocateDoubleMap ? 1 : 0); if (IsSystem() && (alloc_flags & AllocateIPC)) kmt_alloc_flags.ui32.NonPaged = 1; // Only allow using the suballocator for ordinary VRAM. if (IsLocalMemory()) { bool subAllocEnabled = !core::Runtime::runtime_singleton_->flag().disable_fragment_alloc(); // Avoid modifying executable or queue allocations. bool useSubAlloc = subAllocEnabled; useSubAlloc &= ((alloc_flags & (~AllocateRestrict)) == 0); if (useSubAlloc) { *address = fragment_allocator_.alloc(size); return HSA_STATUS_SUCCESS; } } // Allocate memory. // If it fails attempt to release memory from the block allocator and retry. *address = AllocateKfdMemory(kmt_alloc_flags, owner()->node_id(), size); if (*address == nullptr) { owner()->Trim(); *address = AllocateKfdMemory(kmt_alloc_flags, owner()->node_id(), size); } if (*address != nullptr) { // Commit the memory. // For system memory, on non-restricted allocation, map it to all GPUs. On // restricted allocation, only CPU is allowed to access by default, so // no need to map // For local memory, only map it to the owning GPU. Mapping to other GPU, // if the access is allowed, is performed on AllowAccess. HsaMemMapFlags map_flag = map_flag_; size_t map_node_count = 1; const uint32_t owner_node_id = owner()->node_id(); const uint32_t* map_node_id = &owner_node_id; if (IsSystem()) { if ((alloc_flags & AllocateRestrict) == 0) { // Map to all GPU agents. map_node_count = core::Runtime::runtime_singleton_->gpu_ids().size(); if (map_node_count == 0) { // No need to pin since no GPU in the platform. return HSA_STATUS_SUCCESS; } map_node_id = &core::Runtime::runtime_singleton_->gpu_ids()[0]; } else { // No need to pin it for CPU exclusive access. return HSA_STATUS_SUCCESS; } } uint64_t alternate_va = 0; const bool is_resident = MakeKfdMemoryResident( map_node_count, map_node_id, *address, size, &alternate_va, map_flag); const bool require_pinning = (!full_profile() || IsLocalMemory() || IsScratch()); if (require_pinning && !is_resident) { FreeKfdMemory(*address, size); *address = NULL; return HSA_STATUS_ERROR_OUT_OF_RESOURCES; } return HSA_STATUS_SUCCESS; } return HSA_STATUS_ERROR_OUT_OF_RESOURCES; } hsa_status_t MemoryRegion::Free(void* address, size_t size) const { ScopedAcquire lock(&owner()->agent_memory_lock_); return FreeImpl(address, size); } hsa_status_t MemoryRegion::FreeImpl(void* address, size_t size) const { if (fragment_allocator_.free(address)) return HSA_STATUS_SUCCESS; MakeKfdMemoryUnresident(address); FreeKfdMemory(address, size); return HSA_STATUS_SUCCESS; } // TODO: Look into a better name and/or making this process transparent to exporting. hsa_status_t MemoryRegion::IPCFragmentExport(void* address) const { ScopedAcquire lock(&owner()->agent_memory_lock_); if (!fragment_allocator_.discardBlock(address)) return HSA_STATUS_ERROR_INVALID_ALLOCATION; return HSA_STATUS_SUCCESS; } hsa_status_t MemoryRegion::GetInfo(hsa_region_info_t attribute, void* value) const { switch (attribute) { case HSA_REGION_INFO_SEGMENT: switch (mem_props_.HeapType) { case HSA_HEAPTYPE_SYSTEM: case HSA_HEAPTYPE_FRAME_BUFFER_PRIVATE: case HSA_HEAPTYPE_FRAME_BUFFER_PUBLIC: *((hsa_region_segment_t*)value) = HSA_REGION_SEGMENT_GLOBAL; break; case HSA_HEAPTYPE_GPU_LDS: *((hsa_region_segment_t*)value) = HSA_REGION_SEGMENT_GROUP; break; default: assert(false && "Memory region should only be global, group"); break; } break; case HSA_REGION_INFO_GLOBAL_FLAGS: switch (mem_props_.HeapType) { case HSA_HEAPTYPE_SYSTEM: case HSA_HEAPTYPE_FRAME_BUFFER_PUBLIC: case HSA_HEAPTYPE_FRAME_BUFFER_PRIVATE: { uint32_t ret = fine_grain() ? HSA_REGION_GLOBAL_FLAG_FINE_GRAINED : HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED; if (kernarg()) ret |= HSA_REGION_GLOBAL_FLAG_KERNARG; *((uint32_t*)value) = ret; break; } default: *((uint32_t*)value) = 0; break; } break; case HSA_REGION_INFO_SIZE: *((size_t*)value) = static_cast(GetPhysicalSize()); break; case HSA_REGION_INFO_ALLOC_MAX_SIZE: switch (mem_props_.HeapType) { case HSA_HEAPTYPE_SYSTEM: *((size_t*)value) = max_sysmem_alloc_size_; break; case HSA_HEAPTYPE_FRAME_BUFFER_PRIVATE: case HSA_HEAPTYPE_FRAME_BUFFER_PUBLIC: case HSA_HEAPTYPE_GPU_SCRATCH: *((size_t*)value) = max_single_alloc_size_; break; default: *((size_t*)value) = 0; } break; case HSA_REGION_INFO_RUNTIME_ALLOC_ALLOWED: switch (mem_props_.HeapType) { case HSA_HEAPTYPE_SYSTEM: case HSA_HEAPTYPE_FRAME_BUFFER_PRIVATE: case HSA_HEAPTYPE_FRAME_BUFFER_PUBLIC: *((bool*)value) = true; break; default: *((bool*)value) = false; break; } break; case HSA_REGION_INFO_RUNTIME_ALLOC_GRANULE: switch (mem_props_.HeapType) { case HSA_HEAPTYPE_SYSTEM: case HSA_HEAPTYPE_FRAME_BUFFER_PRIVATE: case HSA_HEAPTYPE_FRAME_BUFFER_PUBLIC: *((size_t*)value) = kPageSize_; break; default: *((size_t*)value) = 0; break; } break; case HSA_REGION_INFO_RUNTIME_ALLOC_ALIGNMENT: switch (mem_props_.HeapType) { case HSA_HEAPTYPE_SYSTEM: case HSA_HEAPTYPE_FRAME_BUFFER_PRIVATE: case HSA_HEAPTYPE_FRAME_BUFFER_PUBLIC: *((size_t*)value) = kPageSize_; break; default: *((size_t*)value) = 0; break; } break; default: switch ((hsa_amd_region_info_t)attribute) { case HSA_AMD_REGION_INFO_HOST_ACCESSIBLE: *((bool*)value) = (mem_props_.HeapType == HSA_HEAPTYPE_SYSTEM) ? true : false; break; case HSA_AMD_REGION_INFO_BASE: *((void**)value) = reinterpret_cast(GetBaseAddress()); break; case HSA_AMD_REGION_INFO_BUS_WIDTH: *((uint32_t*)value) = BusWidth(); break; case HSA_AMD_REGION_INFO_MAX_CLOCK_FREQUENCY: *((uint32_t*)value) = MaxMemCloc(); break; default: return HSA_STATUS_ERROR_INVALID_ARGUMENT; break; } break; } return HSA_STATUS_SUCCESS; } hsa_status_t MemoryRegion::GetPoolInfo(hsa_amd_memory_pool_info_t attribute, void* value) const { switch (attribute) { case HSA_AMD_MEMORY_POOL_INFO_SEGMENT: case HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS: case HSA_AMD_MEMORY_POOL_INFO_SIZE: case HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED: case HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_GRANULE: case HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALIGNMENT: return GetInfo(static_cast(attribute), value); case HSA_AMD_MEMORY_POOL_INFO_ACCESSIBLE_BY_ALL: *((bool*)value) = IsSystem() ? true : false; break; case HSA_AMD_MEMORY_POOL_INFO_ALLOC_MAX_SIZE: switch (mem_props_.HeapType) { case HSA_HEAPTYPE_FRAME_BUFFER_PRIVATE: case HSA_HEAPTYPE_FRAME_BUFFER_PUBLIC: case HSA_HEAPTYPE_GPU_SCRATCH: return GetInfo(HSA_REGION_INFO_ALLOC_MAX_SIZE, value); case HSA_HEAPTYPE_SYSTEM: // Aggregate size available for allocation *((size_t*)value) = max_sysmem_alloc_size_; break; default: *((size_t*)value) = 0; } break; default: return HSA_STATUS_ERROR_INVALID_ARGUMENT; } return HSA_STATUS_SUCCESS; } hsa_amd_memory_pool_access_t MemoryRegion::GetAccessInfo( const core::Agent& agent, const core::Runtime::LinkInfo& link_info) const { // Return allowed by default if memory pool is owned by requesting device if (agent.public_handle().handle == owner()->public_handle().handle) { return HSA_AMD_MEMORY_POOL_ACCESS_ALLOWED_BY_DEFAULT; } // Requesting device does not have a link if (link_info.num_hop < 1) { return HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED; } // Determine access to fine and coarse grained system memory // Return allowed by default if requesting device is a CPU // Return disallowed by default if requesting device is not a CPU if (IsSystem()) { return (agent.device_type() == core::Agent::kAmdCpuDevice) ? (HSA_AMD_MEMORY_POOL_ACCESS_ALLOWED_BY_DEFAULT) : (HSA_AMD_MEMORY_POOL_ACCESS_DISALLOWED_BY_DEFAULT); } // Determine access type for device local memory which is // guaranteed to be HSA_HEAPTYPE_FRAME_BUFFER_PUBLIC // Return disallowed by default if framebuffer is coarse grained // without regard to type of requesting device (CPU / GPU) // Return disallowed by default if framebuffer is fine grained // and requesting device is connected via xGMI link // Return never allowed if framebuffer is fine grained and // requesting device is connected via PCIe link if (IsLocalMemory()) { // Return disallowed by default if memory is coarse // grained without regard to link type if (fine_grain() == false) { return HSA_AMD_MEMORY_POOL_ACCESS_DISALLOWED_BY_DEFAULT; } // Determine if pool is pseudo fine-grained due to env flag // Return disallowed by default if (core::Runtime::runtime_singleton_->flag().fine_grain_pcie()) { return HSA_AMD_MEMORY_POOL_ACCESS_DISALLOWED_BY_DEFAULT; } // Return disallowed by default if memory is fine // grained and link type is xGMI. if (agent.HiveId() == owner()->HiveId()) { return HSA_AMD_MEMORY_POOL_ACCESS_DISALLOWED_BY_DEFAULT; } // Return never allowed if memory is fine grained // link type is not xGMI i.e. link is PCIe return HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED; } // Return never allowed if above conditions are not satisified // This can happen when memory pool references neither system // or device local memory return HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED; } hsa_status_t MemoryRegion::GetAgentPoolInfo( const core::Agent& agent, hsa_amd_agent_memory_pool_info_t attribute, void* value) const { const uint32_t node_id_from = agent.node_id(); const uint32_t node_id_to = owner()->node_id(); const core::Runtime::LinkInfo link_info = core::Runtime::runtime_singleton_->GetLinkInfo(node_id_from, node_id_to); const hsa_amd_memory_pool_access_t access_type = GetAccessInfo(agent, link_info); switch (attribute) { case HSA_AMD_AGENT_MEMORY_POOL_INFO_ACCESS: *((hsa_amd_memory_pool_access_t*)value) = access_type; break; case HSA_AMD_AGENT_MEMORY_POOL_INFO_NUM_LINK_HOPS: *((uint32_t*)value) = (access_type != HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED) ? link_info.num_hop : 0; break; case HSA_AMD_AGENT_MEMORY_POOL_INFO_LINK_INFO: memset(value, 0, sizeof(hsa_amd_memory_pool_link_info_t)); if ((access_type != HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED) && (link_info.num_hop > 0)) { memcpy(value, &link_info.info, sizeof(hsa_amd_memory_pool_link_info_t)); } break; default: return HSA_STATUS_ERROR_INVALID_ARGUMENT; } return HSA_STATUS_SUCCESS; } hsa_status_t MemoryRegion::AllowAccess(uint32_t num_agents, const hsa_agent_t* agents, const void* ptr, size_t size) const { if (num_agents == 0 || agents == NULL || ptr == NULL || size == 0) { return HSA_STATUS_ERROR_INVALID_ARGUMENT; } if (!IsSystem() && !IsLocalMemory()) { return HSA_STATUS_ERROR; } // Adjust for fragments. Make accessibility sticky for fragments since this will satisfy the // union of accessible agents between the fragments in the block. hsa_amd_pointer_info_t info; uint32_t agent_count = 0; hsa_agent_t* accessible = nullptr; MAKE_SCOPE_GUARD([&]() { free(accessible); }); core::Runtime::PtrInfoBlockData blockInfo; std::vector union_agents; info.size = sizeof(info); ScopedAcquire lock(&access_lock_); if (core::Runtime::runtime_singleton_->PtrInfo(const_cast(ptr), &info, malloc, &agent_count, &accessible, &blockInfo) == HSA_STATUS_SUCCESS) { if (blockInfo.length != size || info.sizeInBytes != size) { for (int i = 0; i < num_agents; i++) union_agents.push_back(agents[i].handle); for (int i = 0; i < agent_count; i++) union_agents.push_back(accessible[i].handle); std::sort(union_agents.begin(), union_agents.end()); const auto& last = std::unique(union_agents.begin(), union_agents.end()); union_agents.erase(last, union_agents.end()); agents = reinterpret_cast(&union_agents[0]); num_agents = union_agents.size(); size = blockInfo.length; ptr = blockInfo.base; } } bool cpu_in_list = false; std::set whitelist_gpus; std::vector whitelist_nodes; for (uint32_t i = 0; i < num_agents; ++i) { core::Agent* agent = core::Agent::Convert(agents[i]); if (agent == NULL || !agent->IsValid()) { return HSA_STATUS_ERROR_INVALID_AGENT; } if (agent->device_type() == core::Agent::kAmdGpuDevice) { whitelist_nodes.push_back(agent->node_id()); whitelist_gpus.insert(reinterpret_cast(agent)); } else { cpu_in_list = true; } } if (whitelist_nodes.size() == 0 && IsSystem()) { assert(cpu_in_list); // This is a system region and only CPU agents in the whitelist. // Remove old mappings. AMD::MemoryRegion::MakeKfdMemoryUnresident(ptr); return HSA_STATUS_SUCCESS; } // If this is a local memory region, the owning gpu always needs to be in // the whitelist. if (IsLocalMemory() && std::find(whitelist_nodes.begin(), whitelist_nodes.end(), owner()->node_id()) == whitelist_nodes.end()) { whitelist_nodes.push_back(owner()->node_id()); whitelist_gpus.insert(reinterpret_cast(owner())); } HsaMemMapFlags map_flag = map_flag_; map_flag.ui32.HostAccess |= (cpu_in_list) ? 1 : 0; { // Sequence with pointer info since queries to other fragments of the block may be adjusted by // this call. ScopedAcquire lock( core::Runtime::runtime_singleton_->memory_lock_.shared()); uint64_t alternate_va = 0; if (!AMD::MemoryRegion::MakeKfdMemoryResident( whitelist_nodes.size(), &whitelist_nodes[0], ptr, size, &alternate_va, map_flag)) { return HSA_STATUS_ERROR_OUT_OF_RESOURCES; } } return HSA_STATUS_SUCCESS; } hsa_status_t MemoryRegion::CanMigrate(const MemoryRegion& dst, bool& result) const { // TODO: not implemented yet. result = false; return HSA_STATUS_ERROR_OUT_OF_RESOURCES; } hsa_status_t MemoryRegion::Migrate(uint32_t flag, const void* ptr) const { // TODO: not implemented yet. return HSA_STATUS_ERROR_OUT_OF_RESOURCES; } hsa_status_t MemoryRegion::Lock(uint32_t num_agents, const hsa_agent_t* agents, void* host_ptr, size_t size, void** agent_ptr) const { if (!IsSystem()) { return HSA_STATUS_ERROR; } if (full_profile()) { // For APU, any host pointer is always accessible by the gpu. *agent_ptr = host_ptr; return HSA_STATUS_SUCCESS; } std::set whitelist_gpus; std::vector whitelist_nodes; if (num_agents == 0 || agents == NULL) { // Map to all GPU agents. whitelist_nodes = core::Runtime::runtime_singleton_->gpu_ids(); whitelist_gpus.insert( core::Runtime::runtime_singleton_->gpu_agents().begin(), core::Runtime::runtime_singleton_->gpu_agents().end()); } else { for (uint32_t i = 0; i < num_agents; ++i) { core::Agent* agent = core::Agent::Convert(agents[i]); if (agent == NULL || !agent->IsValid()) { return HSA_STATUS_ERROR_INVALID_AGENT; } if (agent->device_type() == core::Agent::kAmdGpuDevice) { whitelist_nodes.push_back(agent->node_id()); whitelist_gpus.insert(agent); } } } if (whitelist_nodes.size() == 0) { // No GPU agents in the whitelist. So no need to register and map since the // platform only has CPUs. *agent_ptr = host_ptr; return HSA_STATUS_SUCCESS; } // Call kernel driver to register and pin the memory. if (RegisterMemory(host_ptr, size, mem_flag_)) { uint64_t alternate_va = 0; if (MakeKfdMemoryResident(whitelist_nodes.size(), &whitelist_nodes[0], host_ptr, size, &alternate_va, map_flag_)) { if (alternate_va != 0) { *agent_ptr = reinterpret_cast(alternate_va); } else { *agent_ptr = host_ptr; } return HSA_STATUS_SUCCESS; } AMD::MemoryRegion::DeregisterMemory(host_ptr); return HSA_STATUS_ERROR_OUT_OF_RESOURCES; } return HSA_STATUS_ERROR; } hsa_status_t MemoryRegion::Unlock(void* host_ptr) const { if (!IsSystem()) { return HSA_STATUS_ERROR; } if (full_profile()) { return HSA_STATUS_SUCCESS; } MakeKfdMemoryUnresident(host_ptr); DeregisterMemory(host_ptr); return HSA_STATUS_SUCCESS; } hsa_status_t MemoryRegion::AssignAgent(void* ptr, size_t size, const core::Agent& agent, hsa_access_permission_t access) const { return HSA_STATUS_SUCCESS; } void MemoryRegion::Trim() const { fragment_allocator_.trim(); } void* MemoryRegion::BlockAllocator::alloc(size_t request_size, size_t& allocated_size) const { void* ret; size_t bsize = AlignUp(request_size, block_size()); hsa_status_t err = region_.AllocateImpl( bsize, core::MemoryRegion::AllocateRestrict | core::MemoryRegion::AllocateDirect, &ret); if (err != HSA_STATUS_SUCCESS) throw AMD::hsa_exception(err, "MemoryRegion::BlockAllocator::alloc failed."); assert(ret != nullptr && "Region returned nullptr on success."); allocated_size = bsize; return ret; } } // namespace amd } // namespace rocr