/*
Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved.

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/

/**
 * @file hip_hcc.cpp
 *
 * Contains definitions for functions that are large enough that we don't want to inline them
 * everywhere. This file is compiled and linked into apps running HIP / HCC path.
 */
#include <assert.h>
#include <exception>
#include <stdint.h>
#include <iostream>
#include <sstream>
#include <list>
#include <sys/types.h>
#include <unistd.h>
#include <deque>
#include <vector>
#include <algorithm>
#include <atomic>
#include <mutex>

#include <hc.hpp>
#include <hc_am.hpp>
#include "hsa/hsa_ext_amd.h"
#include "hsa/hsa_ext_image.h"
#include "hip/hip_runtime.h"
#include "hip_hcc_internal.h"
#include "hip/hip_hcc.h"
#include "trace_helper.h"
#include "env.h"

// TODO - create a stream-based debug interface as an additional option for tprintf
#define DB_PEER_CTX 0


//=================================================================================================
// Global variables:
//=================================================================================================
const int release = 1;

const char* API_COLOR = KGRN;
const char* API_COLOR_END = KNRM;

int HIP_LAUNCH_BLOCKING = 0;
std::string HIP_LAUNCH_BLOCKING_KERNELS;
std::vector<std::string> g_hipLaunchBlockingKernels;
int HIP_API_BLOCKING = 0;


int HIP_PRINT_ENV = 0;
int HIP_TRACE_API = 0;
std::string HIP_TRACE_API_COLOR("green");
int HIP_PROFILE_API = 0;

// TODO - DB_START/STOP need more testing.
std::string HIP_DB_START_API;
std::string HIP_DB_STOP_API;
int HIP_DB = 0;
int HIP_VISIBLE_DEVICES = 0;
int HIP_WAIT_MODE = 0;

int HIP_FORCE_P2P_HOST = 0;
int HIP_FAIL_SOC = 0;
int HIP_DENY_PEER_ACCESS = 0;

int HIP_HIDDEN_FREE_MEM = 256;
// Force async copies to actually use the synchronous copy interface.
int HIP_FORCE_SYNC_COPY = 0;

// TODO - set these to 0 and 1
int HIP_EVENT_SYS_RELEASE = 0;
int HIP_HOST_COHERENT = 1;

int HIP_SYNC_HOST_ALLOC = 1;
int HIP_SYNC_FREE = 0;


int HIP_INIT_ALLOC = -1;
int HIP_SYNC_STREAM_WAIT = 0;
int HIP_FORCE_NULL_STREAM = 0;

int HIP_DUMP_CODE_OBJECT = 0;


#if (__hcc_workweek__ >= 17300)
// Make sure we have required bug fix in HCC
// Perform resolution on the GPU:
// Chicken bit to sync on host to implement null stream.
// If 0, null stream synchronization is performed on the GPU
int HIP_SYNC_NULL_STREAM = 0;
#else
int HIP_SYNC_NULL_STREAM = 1;
#endif

// HIP needs to change some behavior based on HCC_OPT_FLUSH :
#if (__hcc_workweek__ >= 17296)
int HCC_OPT_FLUSH = 1;
#else
#warning "HIP disabled HCC_OPT_FLUSH since HCC version does not yet support"
int HCC_OPT_FLUSH = 0;
#endif

// Array of pointers to devices.
ihipDevice_t** g_deviceArray;


bool g_visible_device = false;
unsigned g_deviceCnt;
std::vector<int> g_hip_visible_devices;
hsa_agent_t g_cpu_agent;
hsa_agent_t* g_allAgents;  // CPU agents + all the visible GPU agents.
unsigned g_numLogicalThreads;
bool g_initDeviceFound = false;

std::atomic<int> g_lastShortTid(1);

// Indexed by short-tid:
//
std::vector<ProfTrigger> g_dbStartTriggers;
std::vector<ProfTrigger> g_dbStopTriggers;

//=================================================================================================
// Top-level "free" functions:
//=================================================================================================
uint64_t recordApiTrace(TlsData *tls, std::string* fullStr, const std::string& apiStr) {
    auto apiSeqNum = tls->tidInfo.apiSeqNum();
    auto tid = tls->tidInfo.tid();

    if ((tid < g_dbStartTriggers.size()) && (apiSeqNum >= g_dbStartTriggers[tid].nextTrigger())) {
        printf("info: resume profiling at %lu\n", apiSeqNum);
        RESUME_PROFILING;
        g_dbStartTriggers.pop_back();
    };
    if ((tid < g_dbStopTriggers.size()) && (apiSeqNum >= g_dbStopTriggers[tid].nextTrigger())) {
        printf("info: stop profiling at %lu\n", apiSeqNum);
        STOP_PROFILING;
        g_dbStopTriggers.pop_back();
    };

    fullStr->reserve(16 + apiStr.length());
    *fullStr = std::to_string(tid) + ".";
    *fullStr += std::to_string(apiSeqNum);
    *fullStr += " ";
    *fullStr += apiStr;

    uint64_t apiStartTick = getTicks();


    if (COMPILE_HIP_DB && HIP_TRACE_API) {
        fprintf(stderr, "%s<<hip-api pid:%d tid:%s @%lu%s\n", API_COLOR, tls->tidInfo.pid(), fullStr->c_str(), apiStartTick,
                API_COLOR_END);
    }

    return apiStartTick;
}


static inline bool ihipIsValidDevice(unsigned deviceIndex) {
    // deviceIndex is unsigned so always > 0
    return (deviceIndex < g_deviceCnt);
}


ihipDevice_t* ihipGetDevice(int deviceIndex) {
    if (ihipIsValidDevice(deviceIndex)) {
        return g_deviceArray[deviceIndex];
    } else {
        return NULL;
    }
}

ihipCtx_t* ihipGetPrimaryCtx(unsigned deviceIndex) {
    ihipDevice_t* device = ihipGetDevice(deviceIndex);
    return device ? device->getPrimaryCtx() : NULL;
};

hipError_t ihipSynchronize(TlsData *tls) {
    ihipGetTlsDefaultCtx()->locked_waitAllStreams();  // ignores non-blocking streams, this waits
                                                      // for all activity to finish.

    return (hipSuccess);
}

TlsData* tls_get_ptr() {
    static thread_local TlsData data;
    return &data;
}

//=================================================================================================
// ihipStream_t:
//=================================================================================================
TidInfo::TidInfo() : _apiSeqNum(0) {
    _shortTid = g_lastShortTid.fetch_add(1);
    _pid = getpid();

    if (COMPILE_HIP_DB && HIP_TRACE_API) {
        std::stringstream tid_ss;
        std::stringstream tid_ss_num;
        tid_ss_num << std::this_thread::get_id();
        tid_ss << std::hex << std::stoull(tid_ss_num.str());

        // cannot use tprintf here since it will recurse back into TlsData constructor
#if COMPILE_HIP_DB
        if (HIP_DB & (1 << DB_API)) {
            char msgStr[1000];
            snprintf(msgStr, sizeof(msgStr),
                    "HIP initialized short_tid#%d (maps to full_tid: 0x%s)\n",
                    tid(), tid_ss.str().c_str());
            fprintf(stderr, "  %ship-%s pid:%d tid:%d:%s%s", dbName[DB_API]._color,
                    dbName[DB_API]._shortName, pid(), tid(), msgStr, KNRM);
        }
#endif
    };
}

//=================================================================================================
// ihipStream_t:
//=================================================================================================
//---
ihipStream_t::ihipStream_t(ihipCtx_t* ctx, hc::accelerator_view av, unsigned int flags)
    : _id(0),  // will be set by add function.
      _flags(flags),
      _ctx(ctx),
      _criticalData(this, av) {
    unsigned schedBits = ctx->_ctxFlags & hipDeviceScheduleMask;

    switch (schedBits) {
        case hipDeviceScheduleAuto:
            _scheduleMode = Auto;
            break;
        case hipDeviceScheduleSpin:
            _scheduleMode = Spin;
            break;
        case hipDeviceScheduleYield:
            _scheduleMode = Yield;
            break;
        case hipDeviceScheduleBlockingSync:
            _scheduleMode = Yield;
            break;
        default:
            _scheduleMode = Auto;
    };
};


//---
ihipStream_t::~ihipStream_t() {}


hc::hcWaitMode ihipStream_t::waitMode() const {
    hc::hcWaitMode waitMode = hc::hcWaitModeActive;

    if (_scheduleMode == Auto) {
        if (g_deviceCnt > g_numLogicalThreads) {
            waitMode = hc::hcWaitModeActive;
        } else {
            waitMode = hc::hcWaitModeBlocked;
        }
    } else if (_scheduleMode == Spin) {
        waitMode = hc::hcWaitModeActive;
    } else if (_scheduleMode == Yield) {
        waitMode = hc::hcWaitModeBlocked;
    } else {
        assert(0);  // bad wait mode.
    }

    if (HIP_WAIT_MODE == 1) {
        waitMode = hc::hcWaitModeBlocked;
    } else if (HIP_WAIT_MODE == 2) {
        waitMode = hc::hcWaitModeActive;
    }

    return waitMode;
}

// Wait for all kernel and data copy commands in this stream to complete.
// This signature should be used in routines that already have locked the stream mutex
void ihipStream_t::wait(LockedAccessor_StreamCrit_t& crit) {
    tprintf(DB_SYNC, "%s wait for queue-empty..\n", ToString(this).c_str());

    crit->_av.wait(waitMode());
}

//---
// Wait for all kernel and data copy commands in this stream to complete.
void ihipStream_t::locked_wait() {
    // create a marker while holding stream lock,
    // but release lock prior to waiting on the marker
    hc::completion_future marker;
    {
        LockedAccessor_StreamCrit_t crit(_criticalData);
        marker = crit->_av.create_marker(hc::no_scope);
    }

    marker.wait(waitMode());
};

// Causes current stream to wait for specified event to complete:
// Note this does not provide any kind of host serialization.
void ihipStream_t::locked_streamWaitEvent(ihipEventData_t& ecd) {
    LockedAccessor_StreamCrit_t crit(_criticalData);

    crit->_av.create_blocking_marker(ecd.marker(), hc::accelerator_scope);
}


// Causes current stream to wait for specified event to complete:
// Note this does not provide any kind of host serialization.
bool ihipStream_t::locked_eventIsReady(hipEvent_t event) {
    LockedAccessor_EventCrit_t ecrit(event->criticalData());

    return (ecrit->_eventData.marker().is_ready());
}

// Create a marker in this stream.
// Save state in the event so it can track the status of the event.
hc::completion_future ihipStream_t::locked_recordEvent(hipEvent_t event) {
    auto scopeFlag = hc::accelerator_scope;
    // The env var HIP_EVENT_SYS_RELEASE sets the default,
    // The explicit flags override the env var (if specified)
    if (event->_flags & hipEventReleaseToSystem) {
        scopeFlag = hc::system_scope;
    } else if (event->_flags & hipEventReleaseToDevice) {
        scopeFlag = hc::accelerator_scope;
    } else {
        scopeFlag = HIP_EVENT_SYS_RELEASE ? hc::system_scope : hc::accelerator_scope;
    }

    // Lock the stream to prevent simultaneous access
    LockedAccessor_StreamCrit_t crit(_criticalData);
    return crit->_av.create_marker(scopeFlag);
};

//=============================================================================


//-------------------------------------------------------------------------------------------------


//---
const ihipDevice_t* ihipStream_t::getDevice() const { return _ctx->getDevice(); };


ihipCtx_t* ihipStream_t::getCtx() const { return _ctx; };


//--
// Lock the stream to prevent other threads from intervening.
LockedAccessor_StreamCrit_t ihipStream_t::lockopen_preKernelCommand() {
    LockedAccessor_StreamCrit_t crit(_criticalData, false /*no unlock at destruction*/);


    return crit;
}


//---
// Must be called after kernel finishes, this releases the lock on the stream so other commands can
// submit.
void ihipStream_t::lockclose_postKernelCommand(const char* kernelName, hc::accelerator_view* av, bool unlockPostponed) {
    bool blockThisKernel = false;

    if (!g_hipLaunchBlockingKernels.empty()) {
        std::string kernelNameString(kernelName);
        for (auto o = g_hipLaunchBlockingKernels.begin(); o != g_hipLaunchBlockingKernels.end();
             o++) {
            if ((*o == kernelNameString)) {
                // printf ("force blocking for kernel %s\n", o->c_str());
                blockThisKernel = true;
            }
        }
    }

    if (HIP_LAUNCH_BLOCKING || blockThisKernel) {
        // TODO - fix this so it goes through proper stream::wait() call.// direct wait OK since we
        // know the stream is locked.
        av->wait(hc::hcWaitModeActive);
        tprintf(DB_SYNC, "%s LAUNCH_BLOCKING for kernel '%s' completion\n", ToString(this).c_str(),
                kernelName);
    }

    // if unlockPostponed is true then this stream will be unlocked later (e.g., see hipExtLaunchMultiKernelMultiDevice for a sample call)
    if (!unlockPostponed) {
        _criticalData.unlock();  // paired with lock from lockopen_preKernelCommand.
    }
};


//=============================================================================
// Recompute the peercnt and the packed _peerAgents whenever a peer is added or deleted.
// The packed _peerAgents can efficiently be used on each memory allocation.
template <>
void ihipCtxCriticalBase_t<CtxMutex>::recomputePeerAgents() {
    _peerCnt = 0;
    std::for_each(_peers.begin(), _peers.end(), [this](ihipCtx_t* ctx) {
        _peerAgents[_peerCnt++] = ctx->getDevice()->_hsaAgent;
    });
}


template <>
bool ihipCtxCriticalBase_t<CtxMutex>::isPeerWatcher(const ihipCtx_t* peer) {
    auto match = std::find_if(_peers.begin(), _peers.end(), [=](const ihipCtx_t* d) {
        return d->getDeviceNum() == peer->getDeviceNum();
    });

    return (match != std::end(_peers));
}


template <>
bool ihipCtxCriticalBase_t<CtxMutex>::addPeerWatcher(const ihipCtx_t* thisCtx,
                                                     ihipCtx_t* peerWatcher) {
    auto match = std::find(_peers.begin(), _peers.end(), peerWatcher);
    if (match == std::end(_peers)) {
        // Not already a peer, let's update the list:
        tprintf(DB_COPY, "addPeerWatcher.  Allocations on %s now visible to peerWatcher %s.\n",
                thisCtx->toString().c_str(), peerWatcher->toString().c_str());
        _peers.push_back(peerWatcher);
        recomputePeerAgents();
        return true;
    }

    // If we get here - peer was already on list, silently ignore.
    return false;
}


template <>
bool ihipCtxCriticalBase_t<CtxMutex>::removePeerWatcher(const ihipCtx_t* thisCtx,
                                                        ihipCtx_t* peerWatcher) {
    auto match = std::find(_peers.begin(), _peers.end(), peerWatcher);
    if (match != std::end(_peers)) {
        // Found a valid peer, let's remove it.
        tprintf(
            DB_COPY,
            "removePeerWatcher.  Allocations on %s no longer visible to former peerWatcher %s.\n",
            thisCtx->toString().c_str(), peerWatcher->toString().c_str());
        _peers.remove(peerWatcher);
        recomputePeerAgents();
        return true;
    } else {
        return false;
    }
}


template <>
void ihipCtxCriticalBase_t<CtxMutex>::resetPeerWatchers(ihipCtx_t* thisCtx) {
    tprintf(DB_COPY, "resetPeerWatchers for context=%s\n", thisCtx->toString().c_str());
    _peers.clear();
    _peerCnt = 0;
    addPeerWatcher(thisCtx, thisCtx);  // peer-list always contains self agent.
}


template <>
void ihipCtxCriticalBase_t<CtxMutex>::printPeerWatchers(FILE* f) const {
    for (auto iter = _peers.begin(); iter != _peers.end(); iter++) {
        fprintf(f, "%s ", (*iter)->toString().c_str());
    };
}


template <>
void ihipCtxCriticalBase_t<CtxMutex>::addStream(ihipStream_t* stream) {
    stream->_id = _streams.size();
    _streams.push_back(stream);
    tprintf(DB_SYNC, " addStream: %s\n", ToString(stream).c_str());
}

template <>
void ihipDeviceCriticalBase_t<DeviceMutex>::addContext(ihipCtx_t* ctx) {
    _ctxs.push_back(ctx);
    tprintf(DB_SYNC, " addContext: %s\n", ToString(ctx).c_str());
}

//=============================================================================

//=================================================================================================
// ihipDevice_t
//=================================================================================================
ihipDevice_t::ihipDevice_t(unsigned deviceId, unsigned deviceCnt, hc::accelerator& acc)
    : _deviceId(deviceId), _acc(acc), _state(0), _criticalData(this) {
    hsa_agent_t* agent = static_cast<hsa_agent_t*>(acc.get_hsa_agent());
    if (agent) {
        int err = hsa_agent_get_info(
            *agent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT, &_computeUnits);
        if (err != HSA_STATUS_SUCCESS) {
            _computeUnits = 1;
        }

        _hsaAgent = *agent;
    } else {
        _hsaAgent.handle = static_cast<uint64_t>(-1);
    }

    initProperties(&_props);


    _primaryCtx = new ihipCtx_t(this, deviceCnt, hipDeviceMapHost);
}


ihipDevice_t::~ihipDevice_t() {
    delete _primaryCtx;
    _primaryCtx = NULL;
}

void ihipDevice_t::locked_removeContext(ihipCtx_t* c) {
    LockedAccessor_DeviceCrit_t crit(_criticalData);

    crit->ctxs().remove(c);
    tprintf(DB_SYNC, " locked_removeContext: %s\n", ToString(c).c_str());
}


void ihipDevice_t::locked_reset() {
    // Obtain mutex access to the device critical data, release by destructor
    LockedAccessor_DeviceCrit_t crit(_criticalData);


    //---
    // Wait for pending activity to complete?  TODO - check if this is required behavior:
    tprintf(DB_SYNC, "locked_reset waiting for activity to complete.\n");

    // Reset and remove streams:
    // Delete all created streams including the default one.
    for (auto ctxI = crit->const_ctxs().begin(); ctxI != crit->const_ctxs().end(); ctxI++) {
        ihipCtx_t* ctx = *ctxI;
        (*ctxI)->locked_reset();
        tprintf(DB_SYNC, " ctx cleanup %s\n", ToString(ctx).c_str());

        delete ctx;
    }
    // Clear the list.
    crit->ctxs().clear();


    // reset _primaryCtx
    _primaryCtx->locked_reset();
    tprintf(DB_SYNC, " _primaryCtx cleanup %s\n", ToString(_primaryCtx).c_str());
    // Reset and release all memory stored in the tracker:
    // Reset will remove peer mapping so don't need to do this explicitly.
    // FIXME - This is clearly a non-const action!  Is this a context reset or a device reset -
    // maybe should reference count?

    _state = 0;
    am_memtracker_reset(_acc);

    // FIXME - Calling am_memtracker_reset is really bad since it destroyed all buffers allocated by
    // the HCC runtime as well such as the printf buffer.  Re-initialze the printf buffer as a
    // workaround for now.
#ifdef HC_FEATURE_PRINTF
    Kalmar::getContext()->initPrintfBuffer();
#endif
};

#define ErrorCheck(x) error_check(x, __LINE__, __FILE__)

void error_check(hsa_status_t hsa_error_code, int line_num, std::string str) {
    if ((hsa_error_code != HSA_STATUS_SUCCESS) && (hsa_error_code != HSA_STATUS_INFO_BREAK)) {
        printf("HSA reported error!\n In file: %s\nAt line: %d\n", str.c_str(), line_num);
    }
}


//---
// Helper for initProperties
// Determines if the given agent is of type HSA_DEVICE_TYPE_GPU and counts it.
static hsa_status_t countGpuAgents(hsa_agent_t agent, void* data) {
    if (data == NULL) {
        return HSA_STATUS_ERROR_INVALID_ARGUMENT;
    }
    hsa_device_type_t device_type;
    hsa_status_t status = hsa_agent_get_info(agent, HSA_AGENT_INFO_DEVICE, &device_type);
    if (status != HSA_STATUS_SUCCESS) {
        return status;
    }
    if (device_type == HSA_DEVICE_TYPE_GPU) {
        (*static_cast<int*>(data))++;
    }
    return HSA_STATUS_SUCCESS;
}


hsa_status_t FindGpuDevice(hsa_agent_t agent, void* data) {
    if (data == NULL) {
        return HSA_STATUS_ERROR_INVALID_ARGUMENT;
    }

    hsa_device_type_t hsa_device_type;
    hsa_status_t hsa_error_code =
        hsa_agent_get_info(agent, HSA_AGENT_INFO_DEVICE, &hsa_device_type);
    if (hsa_error_code != HSA_STATUS_SUCCESS) {
        return hsa_error_code;
    }

    if (hsa_device_type == HSA_DEVICE_TYPE_GPU) {
        *((hsa_agent_t*)data) = agent;
        return HSA_STATUS_INFO_BREAK;
    }

    return HSA_STATUS_SUCCESS;
}

hsa_status_t GetDevicePool(hsa_amd_memory_pool_t pool, void* data) {
    if (NULL == data) {
        return HSA_STATUS_ERROR_INVALID_ARGUMENT;
    }

    hsa_status_t err;
    hsa_amd_segment_t segment;
    uint32_t flag;

    err = hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_SEGMENT, &segment);
    ErrorCheck(err);
    if (HSA_AMD_SEGMENT_GLOBAL != segment) return HSA_STATUS_SUCCESS;
    err = hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &flag);
    ErrorCheck(err);
    *((hsa_amd_memory_pool_t*)data) = pool;
    return HSA_STATUS_SUCCESS;
}


int checkAccess(hsa_agent_t agent, hsa_amd_memory_pool_t pool) {
    hsa_status_t err;
    hsa_amd_memory_pool_access_t access;
    err = hsa_amd_agent_memory_pool_get_info(agent, pool, HSA_AMD_AGENT_MEMORY_POOL_INFO_ACCESS,
                                             &access);
    ErrorCheck(err);
    return access;
}

hsa_status_t get_pool_info(hsa_amd_memory_pool_t pool, void* data) {
    hsa_status_t err;
    hipDeviceProp_t* p_prop = reinterpret_cast<hipDeviceProp_t*>(data);
    uint32_t region_segment;

    // Get pool segment
    err = hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_SEGMENT, &region_segment);
    ErrorCheck(err);

    switch (region_segment) {
        case HSA_REGION_SEGMENT_READONLY:
            err = hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_SIZE,
                                               &(p_prop->totalConstMem));
            break;
        case HSA_REGION_SEGMENT_GROUP:
            err = hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_SIZE,
                                               &(p_prop->sharedMemPerBlock));
            break;
        default:
            break;
    }
    return err;
}


// Determines if the given agent is of type HSA_DEVICE_TYPE_GPU and counts it.
static hsa_status_t findCpuAgent(hsa_agent_t agent, void* data) {
    hsa_device_type_t device_type;
    hsa_status_t status = hsa_agent_get_info(agent, HSA_AGENT_INFO_DEVICE, &device_type);
    if (status != HSA_STATUS_SUCCESS) {
        return status;
    }
    if (device_type == HSA_DEVICE_TYPE_CPU) {
        (*static_cast<hsa_agent_t*>(data)) = agent;
        return HSA_STATUS_INFO_BREAK;
    }

    return HSA_STATUS_SUCCESS;
}


#define DeviceErrorCheck(x)                                                                        \
    if (x != HSA_STATUS_SUCCESS) {                                                                 \
        return hipErrorInvalidDevice;                                                              \
    }

//---
// Initialize properties for the device.
// Call this once when the ihipDevice_t is created:
hipError_t ihipDevice_t::initProperties(hipDeviceProp_t* prop) {
    hipError_t e = hipSuccess;
    hsa_status_t err;

    memset(prop, 0, sizeof(hipDeviceProp_t));

    if (_hsaAgent.handle == -1) {
        return hipErrorInvalidDevice;
    }

    // Iterates over the agents to determine Multiple GPU devices
    // using the countGpuAgents callback.
    //! @bug : on HCC, isMultiGpuBoard returns True if system contains multiple GPUS (rather than if
    //! GPU is on a multi-ASIC board)
    int gpuAgentsCount = 0;
    err = hsa_iterate_agents(countGpuAgents, &gpuAgentsCount);
    if (err == HSA_STATUS_INFO_BREAK) {
        err = HSA_STATUS_SUCCESS;
    }
    DeviceErrorCheck(err);
    prop->isMultiGpuBoard = 0 ? gpuAgentsCount < 2 : 1;

    // Get agent name

    err = hsa_agent_get_info(_hsaAgent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_PRODUCT_NAME,
                             &(prop->name));
    DeviceErrorCheck(err);

    char archName[256];
    err = hsa_agent_get_info(_hsaAgent, HSA_AGENT_INFO_NAME, &archName);

    prop->gcnArch = atoi(archName + 3);

    DeviceErrorCheck(err);

    // Get agent node
    uint32_t node;
    err = hsa_agent_get_info(_hsaAgent, HSA_AGENT_INFO_NODE, &node);
    DeviceErrorCheck(err);

    // Get wavefront size
    err = hsa_agent_get_info(_hsaAgent, HSA_AGENT_INFO_WAVEFRONT_SIZE, &prop->warpSize);
    DeviceErrorCheck(err);

    // Get max total number of work-items in a workgroup
    err =
        hsa_agent_get_info(_hsaAgent, HSA_AGENT_INFO_WORKGROUP_MAX_SIZE, &prop->maxThreadsPerBlock);
    DeviceErrorCheck(err);

    // Get max number of work-items of each dimension of a work-group
    uint16_t work_group_max_dim[3];
    err = hsa_agent_get_info(_hsaAgent, HSA_AGENT_INFO_WORKGROUP_MAX_DIM, work_group_max_dim);
    DeviceErrorCheck(err);
    for (int i = 0; i < 3; i++) {
        prop->maxThreadsDim[i] = work_group_max_dim[i];
    }

    hsa_dim3_t grid_max_dim;
    err = hsa_agent_get_info(_hsaAgent, HSA_AGENT_INFO_GRID_MAX_DIM, &grid_max_dim);
    DeviceErrorCheck(err);
    prop->maxGridSize[0] = (int)((grid_max_dim.x == UINT32_MAX) ? (INT32_MAX) : grid_max_dim.x);
    prop->maxGridSize[1] = (int)((grid_max_dim.y == UINT32_MAX) ? (INT32_MAX) : grid_max_dim.y);
    prop->maxGridSize[2] = (int)((grid_max_dim.z == UINT32_MAX) ? (INT32_MAX) : grid_max_dim.z);

    // Get Max clock frequency
    err = hsa_agent_get_info(_hsaAgent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_MAX_CLOCK_FREQUENCY,
                             &prop->clockRate);
    prop->clockRate *= 1000.0;  // convert Mhz to Khz.
    DeviceErrorCheck(err);

    uint64_t counterHz;
    err = hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP_FREQUENCY, &counterHz);
    DeviceErrorCheck(err);
    prop->clockInstructionRate = counterHz / 1000;

    // Get Agent BDFID (bus/device/function ID)
    uint16_t bdf_id = 1;
    err = hsa_agent_get_info(_hsaAgent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_BDFID, &bdf_id);
    DeviceErrorCheck(err);

    // BDFID is 16bit uint: [8bit - BusID | 5bit - Device ID | 3bit - Function/DomainID]
    prop->pciDomainID = bdf_id & 0x7;
    prop->pciDeviceID = (bdf_id >> 3) & 0x1F;
    prop->pciBusID = (bdf_id >> 8) & 0xFF;

    // Masquerade as a 3.0-level device. This will change as more HW functions are properly
    // supported. Application code should use the arch.has* to do detailed feature detection.
    prop->major = 3;
    prop->minor = 0;

    // Get number of Compute Unit
    err = hsa_agent_get_info(_hsaAgent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT,
                             &(prop->multiProcessorCount));
    DeviceErrorCheck(err);

    // TODO-hsart - this appears to return 0?
    uint32_t cache_size[4];
    err = hsa_agent_get_info(_hsaAgent, HSA_AGENT_INFO_CACHE_SIZE, cache_size);
    DeviceErrorCheck(err);
    prop->l2CacheSize = cache_size[1];

    /* Computemode for HSA Devices is always : cudaComputeModeDefault */
    prop->computeMode = 0;

    _isLargeBar = _acc.has_cpu_accessible_am();

    // Get Max Threads Per Multiprocessor
    uint32_t max_waves_per_cu;
    err = hsa_agent_get_info(_hsaAgent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_MAX_WAVES_PER_CU,
                             &max_waves_per_cu);
    DeviceErrorCheck(err);
    prop->maxThreadsPerMultiProcessor = prop->warpSize * max_waves_per_cu;

    // Get memory properties
    err = hsa_amd_agent_iterate_memory_pools(_hsaAgent, get_pool_info, prop);
    if (err == HSA_STATUS_INFO_BREAK) {
        err = HSA_STATUS_SUCCESS;
    }
    DeviceErrorCheck(err);

    // Get the size of the pool we are using for Accelerator Memory allocations:
    hsa_region_t* am_region = static_cast<hsa_region_t*>(_acc.get_hsa_am_region());
    err = hsa_region_get_info(*am_region, HSA_REGION_INFO_SIZE, &prop->totalGlobalMem);
    DeviceErrorCheck(err);
    // maxSharedMemoryPerMultiProcessor should be as the same as group memory size.
    // Group memory will not be paged out, so, the physical memory size is the total shared memory
    // size, and also equal to the group pool size.
    prop->maxSharedMemoryPerMultiProcessor = prop->totalGlobalMem;

    // Get Max memory clock frequency
    err =
        hsa_region_get_info(*am_region, (hsa_region_info_t)HSA_AMD_REGION_INFO_MAX_CLOCK_FREQUENCY,
                            &prop->memoryClockRate);
    DeviceErrorCheck(err);
    prop->memoryClockRate *= 1000.0;  // convert Mhz to Khz.

    // Get global memory bus width in bits
    err = hsa_region_get_info(*am_region, (hsa_region_info_t)HSA_AMD_REGION_INFO_BUS_WIDTH,
                              &prop->memoryBusWidth);
    DeviceErrorCheck(err);

    // Set feature flags - these are all mandatory for HIP on HCC path:
    // Some features are under-development and future revs may support flags that are currently 0.
    // Reporting of these flags should be synchronized with the HIP_ARCH* compile-time defines in
    // hip_runtime.h

    prop->arch.hasGlobalInt32Atomics = 1;
    prop->arch.hasGlobalFloatAtomicExch = 1;
    prop->arch.hasSharedInt32Atomics = 1;
    prop->arch.hasSharedFloatAtomicExch = 1;
    prop->arch.hasFloatAtomicAdd = 1;  // supported with CAS loop, but is supported
    prop->arch.hasGlobalInt64Atomics = 1;
    prop->arch.hasSharedInt64Atomics = 1;
    prop->arch.hasDoubles = 1;
    prop->arch.hasWarpVote = 1;
    prop->arch.hasWarpBallot = 1;
    prop->arch.hasWarpShuffle = 1;
    prop->arch.hasFunnelShift = 0;  // TODO-hcc
    prop->arch.hasThreadFenceSystem = 1;
    prop->arch.hasSyncThreadsExt = 0;  // TODO-hcc
    prop->arch.hasSurfaceFuncs = 0;    // TODO-hcc
    prop->arch.has3dGrid = 1;
    prop->arch.hasDynamicParallelism = 0;

    prop->concurrentKernels =
        1;  // All ROCm hardware supports executing multiple kernels concurrently

    prop->canMapHostMemory = 1;  // All ROCm devices can map host memory
    prop->totalConstMem = 16384;
#if 0
    // TODO - code broken below since it always returns 1.
    // Are the flags part of the context or part of the device?
    if ( _device_flags | hipDeviceMapHost) {
        prop->canMapHostMemory = 1;
    } else {
        prop->canMapHostMemory = 0;
    }
#endif
    // Get profile
    hsa_profile_t agent_profile;
    err = hsa_agent_get_info(_hsaAgent, HSA_AGENT_INFO_PROFILE, &agent_profile);
    DeviceErrorCheck(err);
    if(agent_profile == HSA_PROFILE_FULL) {
        prop->integrated = 1;
    }

    // Enable the cooperative group for gfx9+
    prop->cooperativeLaunch = (prop->gcnArch < 900) ? 0 : 1;
    prop->cooperativeMultiDeviceLaunch = (prop->gcnArch < 900) ? 0 : 1;

    err = hsa_agent_get_info(_hsaAgent, (hsa_agent_info_t)HSA_EXT_AGENT_INFO_IMAGE_1D_MAX_ELEMENTS,
          &prop->maxTexture1D);
    DeviceErrorCheck(err);

    err = hsa_agent_get_info(_hsaAgent, (hsa_agent_info_t)HSA_EXT_AGENT_INFO_IMAGE_2D_MAX_ELEMENTS,
          prop->maxTexture2D);
    DeviceErrorCheck(err);

    err = hsa_agent_get_info(_hsaAgent, (hsa_agent_info_t)HSA_EXT_AGENT_INFO_IMAGE_3D_MAX_ELEMENTS,
          prop->maxTexture3D);
    DeviceErrorCheck(err);

    // Get Agent HDP Flush Register Memory
    hsa_amd_hdp_flush_t hdpinfo;
    err = hsa_agent_get_info(_hsaAgent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_HDP_FLUSH, &hdpinfo);
    DeviceErrorCheck(err);

    prop->hdpMemFlushCntl = hdpinfo.HDP_MEM_FLUSH_CNTL;
    prop->hdpRegFlushCntl = hdpinfo.HDP_REG_FLUSH_CNTL;

    return e;
}


//=================================================================================================
// ihipCtx_t
//=================================================================================================
ihipCtx_t::ihipCtx_t(ihipDevice_t* device, unsigned deviceCnt, unsigned flags)
    : _ctxFlags(flags), _device(device), _criticalData(this, deviceCnt) {
    // locked_reset();
    LockedAccessor_CtxCrit_t crit(_criticalData);
    _defaultStream = new ihipStream_t(this, getDevice()->_acc.get_default_view(), hipStreamDefault);
    crit->addStream(_defaultStream);


    // Reset peer list to just me:
    crit->resetPeerWatchers(this);
    tprintf(DB_SYNC, "created ctx with defaultStream=%p (%s)\n", _defaultStream,
            ToString(_defaultStream).c_str());
};


ihipCtx_t::~ihipCtx_t() {
    if (_defaultStream) {
        delete _defaultStream;
        _defaultStream = NULL;
    }
}
// Reset the device - this is called from hipDeviceReset.
// Device may be reset multiple times, and may be reset after init.
void ihipCtx_t::locked_reset() {
    // Obtain mutex access to the device critical data, release by destructor
    LockedAccessor_CtxCrit_t crit(_criticalData);


    //---
    // Wait for pending activity to complete?  TODO - check if this is required behavior:
    tprintf(DB_SYNC, "locked_reset waiting for activity to complete.\n");

    // Reset and remove streams:
    // Delete all created streams including the default one.
    for (auto streamI = crit->const_streams().begin(); streamI != crit->const_streams().end();
         streamI++) {
        ihipStream_t* stream = *streamI;
        (*streamI)->locked_wait();
        tprintf(DB_SYNC, " delete %s\n", ToString(stream).c_str());

        delete stream;
    }
    // Clear the list.
    crit->streams().clear();


    // Create a fresh default stream and add it:
    _defaultStream = new ihipStream_t(this, getDevice()->_acc.get_default_view(), hipStreamDefault);
    crit->addStream(_defaultStream);

#if 0
    // Reset peer list to just me:
    crit->resetPeerWatchers(this);

    // Reset and release all memory stored in the tracker:
    // Reset will remove peer mapping so don't need to do this explicitly.
    // FIXME - This is clearly a non-const action!  Is this a context reset or a device reset - maybe should reference count?
    ihipDevice_t *device = getWriteableDevice();
    device->_state = 0;
    am_memtracker_reset(device->_acc);
#endif
};


//---
std::string ihipCtx_t::toString() const {
    std::ostringstream ss;
    ss << this;
    return ss.str();
};


//----


//=================================================================================================
// Utility functions, these are not part of the public HIP API
//=================================================================================================

//=================================================================================================


//   This called for submissions that are sent to the null/default stream.  This routine ensures
//   that this new command waits for activity in the other streams to complete before proceeding.
//
//   HIP_SYNC_NULL_STREAM=0 does all dependency resolutiokn on the GPU
//   HIP_SYNC_NULL_STREAM=1 s legacy non-optimal mode which conservatively waits on host.
//
//   If waitOnSelf is set, this additionally waits for the default stream to empty.
//   In new HIP_SYNC_NULL_STREAM=0 mode, this enqueues a marker which causes the default stream to
//   wait for other activity, but doesn't actually block the host.  If host blocking is desired, the
//   caller should set syncHost.
//
//   syncToHost causes host to wait for the stream to finish.
//   Note HIP_SYNC_NULL_STREAM=1 path always sync to Host.
void ihipCtx_t::locked_syncDefaultStream(bool waitOnSelf, bool syncHost) {
    LockedAccessor_CtxCrit_t crit(_criticalData);

    tprintf(DB_SYNC, "syncDefaultStream \n");

    // Vector of ops sent to each stream that will complete before ops sent to null stream:
    std::vector<hc::completion_future> depOps;

    for (auto streamI = crit->const_streams().begin(); streamI != crit->const_streams().end();
         streamI++) {
        ihipStream_t* stream = *streamI;

        // Don't wait for streams that have "opted-out" of syncing with NULL stream.
        // And - don't wait for the NULL stream, unless waitOnSelf specified.
        bool waitThisStream = (!(stream->_flags & hipStreamNonBlocking)) &&
                              (waitOnSelf || (stream != _defaultStream));

        if (HIP_SYNC_NULL_STREAM) {
            if (waitThisStream) {
                stream->locked_wait();
            }
        } else {
            if (waitThisStream) {
                LockedAccessor_StreamCrit_t streamCrit(stream->criticalData());

                // The last marker will provide appropriate visibility:
                if (!streamCrit->_av.get_is_empty()) {
                    depOps.push_back(streamCrit->_av.create_marker(hc::accelerator_scope));
                    tprintf(DB_SYNC, "  push marker to wait for stream=%s\n",
                            ToString(stream).c_str());
                } else {
                    tprintf(DB_SYNC, "  skipped stream=%s since it is empty\n",
                            ToString(stream).c_str());
                }
            }
        }
    }


    // Enqueue a barrier to wait on all the barriers we sent above:
    if (!HIP_SYNC_NULL_STREAM && !depOps.empty()) {
        LockedAccessor_StreamCrit_t defaultStreamCrit(_defaultStream->criticalData());
        tprintf(DB_SYNC, "  null-stream wait on %zu non-empty streams. sync_host=%d\n",
                depOps.size(), syncHost);
        hc::completion_future defaultCf = defaultStreamCrit->_av.create_blocking_marker(
            depOps.begin(), depOps.end(), hc::accelerator_scope);
        if (syncHost) {
            defaultCf.wait();  // TODO - account for active or blocking here.
        }
    }

    tprintf(DB_SYNC, "  syncDefaultStream depOps=%zu\n", depOps.size());
}


//---
void ihipCtx_t::locked_removeStream(ihipStream_t* s) {
    LockedAccessor_CtxCrit_t crit(_criticalData);

    crit->streams().remove(s);
}


//---
// Heavyweight synchronization that waits on all streams, ignoring hipStreamNonBlocking flag.
void ihipCtx_t::locked_waitAllStreams() {
    LockedAccessor_CtxCrit_t crit(_criticalData);

    tprintf(DB_SYNC, "waitAllStream\n");
    for (auto streamI = crit->const_streams().begin(); streamI != crit->const_streams().end();
         streamI++) {
        (*streamI)->locked_wait();
    }
}


std::string HIP_DB_string(unsigned db) {
    std::string dbStr;
    bool first = true;
    for (int i = 0; i < DB_MAX_FLAG; i++) {
        if (db & (1 << i)) {
            if (!first) {
                dbStr += "+";
            };
            dbStr += dbName[i]._color;
            dbStr += dbName[i]._shortName;
            dbStr += KNRM;
            first = false;
        };
    }

    return dbStr;
}

// Callback used to process HIP_DB input, supports either
// integer or flag names separated by +
std::string HIP_DB_callback(void* var_ptr, const char* envVarString) {
    int* var_ptr_int = static_cast<int*>(var_ptr);

    std::string e(envVarString);
    trim(&e);
    if (!e.empty() && isdigit(e.c_str()[0])) {
        long int v = strtol(envVarString, NULL, 0);
        *var_ptr_int = (int)(v);
    } else {
        *var_ptr_int = 0;
        std::vector<std::string> tokens;
        tokenize(e, '+', &tokens);
        for (auto t = tokens.begin(); t != tokens.end(); t++) {
            for (int i = 0; i < DB_MAX_FLAG; i++) {
                if (!strcmp(t->c_str(), dbName[i]._shortName)) {
                    *var_ptr_int |= (1 << i);
                }  // TODO - else throw error?
            }
        }
    }

    return HIP_DB_string(*var_ptr_int);
    ;
}


// Callback used to process list of visible devices.
std::string HIP_VISIBLE_DEVICES_callback(void* var_ptr, const char* envVarString) {
    // Parse the string stream of env and store the device ids to g_hip_visible_devices global
    // variable
    std::string str = envVarString;
    std::istringstream ss(str);
    std::string device_id;
    // Clean up the defult value
    g_hip_visible_devices.clear();
    g_visible_device = true;
    // Read the visible device numbers
    while (std::getline(ss, device_id, ',')) {
        if (atoi(device_id.c_str()) >= 0) {
            g_hip_visible_devices.push_back(atoi(device_id.c_str()));
        } else {  // Any device number after invalid number will not present
            break;
        }
    }

    std::string valueString;
    // Print out the number of ids
    for (int i = 0; i < g_hip_visible_devices.size(); i++) {
        valueString += std::to_string((g_hip_visible_devices[i]));
        valueString += ' ';
    }

    return valueString;
}


// TODO - change last arg to pointer.
void parseTrigger(std::string triggerString, std::vector<ProfTrigger>& profTriggers) {
    std::vector<std::string> tidApiTokens;
    tokenize(std::string(triggerString), ',', &tidApiTokens);
    for (auto t = tidApiTokens.begin(); t != tidApiTokens.end(); t++) {
        std::vector<std::string> oneToken;
        // std::cout << "token=" << *t << "\n";
        tokenize(std::string(*t), '.', &oneToken);
        int tid = 1;
        uint64_t apiTrigger = 0;
        if (oneToken.size() == 1) {
            // the case with just apiNum
            apiTrigger = std::strtoull(oneToken[0].c_str(), nullptr, 0);
        } else if (oneToken.size() == 2) {
            // the case with tid.apiNum
            tid = std::strtoul(oneToken[0].c_str(), nullptr, 0);
            apiTrigger = std::strtoull(oneToken[1].c_str(), nullptr, 0);
        } else {
            throw ihipException(hipErrorRuntimeOther);  // TODO -> bad env var?
        }

        if (tid > 10000) {
            throw ihipException(hipErrorRuntimeOther);  // TODO -> bad env var?
        } else {
            profTriggers.resize(tid + 1);
            // std::cout << "tid:" << tid << " add: " << apiTrigger << "\n";
            profTriggers[tid].add(apiTrigger);
        }
    }


    for (int tid = 1; tid < profTriggers.size(); tid++) {
        profTriggers[tid].sort();
        profTriggers[tid].print(tid);
    }
}


void HipReadEnv() {
    /*
     * Environment variables
     */
    g_hip_visible_devices.push_back(0); /* Set the default value of visible devices */
    READ_ENV_I(release, HIP_PRINT_ENV, 0, "Print HIP environment variables.");
    //-- READ HIP_PRINT_ENV env first, since it has impact on later env var reading

    // TODO: In HIP/hcc, this variable blocks after both kernel commmands and data transfer.  Maybe
    // should be bit-mask for each command type?
    READ_ENV_I(release, HIP_LAUNCH_BLOCKING, CUDA_LAUNCH_BLOCKING,
               "Make HIP kernel launches 'host-synchronous', so they block until any kernel "
               "launches. Alias: CUDA_LAUNCH_BLOCKING.");
    READ_ENV_S(release, HIP_LAUNCH_BLOCKING_KERNELS, 0,
               "Comma-separated list of kernel names to make host-synchronous, so they block until "
               "completed.");
    if (!HIP_LAUNCH_BLOCKING_KERNELS.empty()) {
        tokenize(HIP_LAUNCH_BLOCKING_KERNELS, ',', &g_hipLaunchBlockingKernels);
    }
    READ_ENV_I(release, HIP_API_BLOCKING, 0,
               "Make HIP APIs 'host-synchronous', so they block until completed.  Impacts "
               "hipMemcpyAsync, hipMemsetAsync.");

    READ_ENV_I(release, HIP_HIDDEN_FREE_MEM, 0,
               "Amount of memory to hide from the free memory reported by hipMemGetInfo, specified "
               "in MB. Impacts hipMemGetInfo.");

    READ_ENV_C(release, HIP_DB, 0,
               "Print debug info.  Bitmask (HIP_DB=0xff) or flags separated by '+' "
               "(HIP_DB=api+sync+mem+copy+fatbin)",
               HIP_DB_callback);
    if ((HIP_DB & (1 << DB_API)) && (HIP_TRACE_API == 0)) {
        // Set HIP_TRACE_API default before we read it, so it is printed correctly.
        HIP_TRACE_API = 1;
    }


    READ_ENV_I(release, HIP_TRACE_API, 0,
               "Trace each HIP API call.  Print function name and return code to stderr as program "
               "executes.");
    READ_ENV_S(release, HIP_TRACE_API_COLOR, 0,
               "Color to use for HIP_API.  None/Red/Green/Yellow/Blue/Magenta/Cyan/White");
    READ_ENV_I(release, HIP_PROFILE_API, 0,
               "Add HIP API markers to ATP file generated with CodeXL. 0x1=short API name, "
               "0x2=full API name including args.");
    READ_ENV_S(release, HIP_DB_START_API, 0,
               "Comma-separated list of tid.api_seq_num for when to start debug and profiling.");
    READ_ENV_S(release, HIP_DB_STOP_API, 0,
               "Comma-separated list of tid.api_seq_num for when to stop debug and profiling.");

    READ_ENV_C(release, HIP_VISIBLE_DEVICES, CUDA_VISIBLE_DEVICES,
               "Only devices whose index is present in the sequence are visible to HIP "
               "applications and they are enumerated in the order of sequence.",
               HIP_VISIBLE_DEVICES_callback);


    READ_ENV_I(release, HIP_WAIT_MODE, 0,
               "Force synchronization mode. 1= force yield, 2=force spin, 0=defaults specified in "
               "application");
    READ_ENV_I(release, HIP_FORCE_P2P_HOST, 0,
               "Force use of host/staging copy for peer-to-peer copies.1=always use copies, "
               "2=always return false for hipDeviceCanAccessPeer");
    READ_ENV_I(release, HIP_FORCE_SYNC_COPY, 0,
               "Force all copies (even hipMemcpyAsync) to use sync copies");
    READ_ENV_I(release, HIP_FAIL_SOC, 0,
               "Fault on Sub-Optimal-Copy, rather than use a slower but functional implementation. "
               " Bit 0x1=Fail on async copy with unpinned memory.  Bit 0x2=Fail peer copy rather "
               "than use staging buffer copy");

    READ_ENV_I(release, HIP_SYNC_HOST_ALLOC, 0,
               "Sync before and after all host memory allocations.  May help stability");
    READ_ENV_I(release, HIP_INIT_ALLOC, 0,
               "If not -1, initialize allocated memory to specified byte");
    READ_ENV_I(release, HIP_SYNC_NULL_STREAM, 0, "Synchronize on host for null stream submissions");
    READ_ENV_I(release, HIP_FORCE_NULL_STREAM, 0,
               "Force all stream allocations to secretly return the null stream");

    READ_ENV_I(release, HIP_SYNC_STREAM_WAIT, 0, "hipStreamWaitEvent will synchronize to host");

    READ_ENV_I(release, HIP_SYNC_FREE, 0,
               "Force all calls to hipFree to sync all devices and all streams");

    READ_ENV_I(release, HIP_HOST_COHERENT, 0,
               "If set, all host memory will be allocated as fine-grained system memory.  This "
               "allows threadfence_system to work but prevents host memory from being cached on "
               "GPU which may have performance impact.");


    READ_ENV_I(release, HCC_OPT_FLUSH, 0,
               "When set, use agent-scope fence operations rather than system-scope fence "
               "operationsflush when possible. This flag controls both HIP and HCC behavior.");
    READ_ENV_I(release, HIP_EVENT_SYS_RELEASE, 0,
               "If set, event are created with hipEventReleaseToSystem by default.  If 0, events "
               "are created with hipEventReleaseToDevice by default.  The defaults can be "
               "overridden by specifying hipEventReleaseToSystem or hipEventReleaseToDevice flag "
               "when creating the event.");

    READ_ENV_I(release, HIP_DUMP_CODE_OBJECT, 0,
               "If set, dump code object as __hip_dump_code_object[nnnn].o in the current directory,"
               "where nnnn is the index number.");

    // Some flags have both compile-time and runtime flags - generate a warning if user enables the
    // runtime flag but the compile-time flag is disabled.
    if (HIP_DB && !COMPILE_HIP_DB) {
        fprintf(stderr,
                "warning: env var HIP_DB=0x%x but COMPILE_HIP_DB=0.  (perhaps enable "
                "COMPILE_HIP_DB in src code before compiling?)\n",
                HIP_DB);
    }

    if (HIP_TRACE_API && !COMPILE_HIP_TRACE_API) {
        fprintf(stderr,
                "warning: env var HIP_TRACE_API=0x%x but COMPILE_HIP_TRACE_API=0.  (perhaps enable "
                "COMPILE_HIP_TRACE_API in src code before compiling?)\n",
                HIP_DB);
    }
    if (HIP_TRACE_API) {
        HIP_DB |= 0x1;
    }

    if (HIP_PROFILE_API && !COMPILE_HIP_ATP_MARKER) {
        fprintf(stderr,
                "warning: env var HIP_PROFILE_API=0x%x but COMPILE_HIP_ATP_MARKER=0.  (perhaps "
                "enable COMPILE_HIP_ATP_MARKER in src code before compiling?)\n",
                HIP_PROFILE_API);
        HIP_PROFILE_API = 0;
    }

    if (HIP_DB) {
        fprintf(stderr, "HIP_DB=0x%x [%s]\n", HIP_DB, HIP_DB_string(HIP_DB).c_str());
    }

    std::transform(HIP_TRACE_API_COLOR.begin(), HIP_TRACE_API_COLOR.end(),
                   HIP_TRACE_API_COLOR.begin(), ::tolower);

    if (HIP_TRACE_API_COLOR == "none") {
        API_COLOR = "";
        API_COLOR_END = "";
    } else if (HIP_TRACE_API_COLOR == "red") {
        API_COLOR = KRED;
    } else if (HIP_TRACE_API_COLOR == "green") {
        API_COLOR = KGRN;
    } else if (HIP_TRACE_API_COLOR == "yellow") {
        API_COLOR = KYEL;
    } else if (HIP_TRACE_API_COLOR == "blue") {
        API_COLOR = KBLU;
    } else if (HIP_TRACE_API_COLOR == "magenta") {
        API_COLOR = KMAG;
    } else if (HIP_TRACE_API_COLOR == "cyan") {
        API_COLOR = KCYN;
    } else if (HIP_TRACE_API_COLOR == "white") {
        API_COLOR = KWHT;
    } else {
        fprintf(stderr,
                "warning: env var HIP_TRACE_API_COLOR=%s must be "
                "None/Red/Green/Yellow/Blue/Magenta/Cyan/White",
                HIP_TRACE_API_COLOR.c_str());
    };

    parseTrigger(HIP_DB_START_API, g_dbStartTriggers);
    parseTrigger(HIP_DB_STOP_API, g_dbStopTriggers);
};


//---
// Function called one-time at initialization time to construct a table of all GPU devices.
// HIP/CUDA uses integer "deviceIds" - these are indexes into this table.
// AMP maintains a table of accelerators, but some are emulated - ie for debug or CPU.
// This function creates a vector with only the GPU accelerators.
// It is called with C++11 call_once, which provided thread-safety.
void ihipInit() {
#if COMPILE_HIP_ATP_MARKER
    amdtInitializeActivityLogger();
    amdtScopedMarker("ihipInit", "HIP", NULL);
#endif


    HipReadEnv();


    /*
     * Build a table of valid compute devices.
     */
    auto accs = hc::accelerator::get_all();

    int deviceCnt = 0;
    for (int i = 0; i < accs.size(); i++) {
        if (!accs[i].get_is_emulated()) {
            deviceCnt++;
        }
    };

    // Make sure the hip visible devices are within the deviceCnt range
    for (int i = 0; i < g_hip_visible_devices.size(); i++) {
        if ((g_hip_visible_devices[i] >= deviceCnt) ||(g_hip_visible_devices[i] < 0)){
            // Make sure any DeviceID after invalid DeviceID will be erased.
            g_hip_visible_devices.resize(i);
            break;
        }
    }

    hsa_status_t err = hsa_iterate_agents(findCpuAgent, &g_cpu_agent);
    if (err != HSA_STATUS_INFO_BREAK) {
        // didn't find a CPU.
        g_initDeviceFound = false;
        return;
    }

    g_deviceArray = new ihipDevice_t*[deviceCnt];
    g_deviceCnt = 0;

    if(g_visible_device) {
        for (int i = 0; i < g_hip_visible_devices.size(); i++) {
            int devIndex = g_hip_visible_devices[i];
            if (!accs[devIndex+1].get_is_emulated()) {
                g_deviceArray[g_deviceCnt] = new ihipDevice_t(g_deviceCnt, deviceCnt, accs[devIndex+1]);
                g_deviceCnt++;
            }
        }
    }else {
        for (int i = 0; i < accs.size(); i++) {
            if (!accs[i].get_is_emulated()) {
                g_deviceArray[g_deviceCnt] = new ihipDevice_t(g_deviceCnt, deviceCnt, accs[i]);
                g_deviceCnt++;
            }
        }
    }

    g_allAgents = static_cast<hsa_agent_t*>(malloc((g_deviceCnt + 1) * sizeof(hsa_agent_t)));
    g_allAgents[0] = g_cpu_agent;
    for (int i = 0; i < g_deviceCnt; i++) {
        g_allAgents[i + 1] = g_deviceArray[i]->_hsaAgent;
    }


    g_numLogicalThreads = std::thread::hardware_concurrency();

    // If HIP_VISIBLE_DEVICES is not set, make sure all devices are initialized
    if (!g_visible_device) {
        assert(deviceCnt == g_deviceCnt);
    }

    tprintf(DB_SYNC, "pid=%u %-30s g_numLogicalThreads=%u\n", getpid(), "<ihipInit>",
            g_numLogicalThreads);

    g_initDeviceFound = true;
}

namespace hip_impl {
hipError_t hip_init() {
  static std::once_flag hip_initialized;
  std::call_once(hip_initialized, ihipInit);
  if (g_initDeviceFound) {
      ihipCtxStackUpdate();
      return hipSuccess;
  }
  else {
      return hipErrorInsufficientDriver;
  }
}
}

hipError_t ihipStreamSynchronize(TlsData *tls, hipStream_t stream) {
    hipError_t e = hipSuccess;

    if (stream == hipStreamNull) {
        ihipCtx_t* ctx = ihipGetTlsDefaultCtx();
        ctx->locked_syncDefaultStream(true /*waitOnSelf*/, true /*syncToHost*/);
    } else {
        // note this does not synchornize with the NULL stream:
        stream->locked_wait();
        e = hipSuccess;
    }

    return e;
}

void ihipStreamCallbackHandler(ihipStreamCallback_t* cb) {
    hipError_t e = hipSuccess;

    // Synchronize stream
    tprintf(DB_SYNC, "ihipStreamCallbackHandler wait on stream %s\n",
            ToString(cb->_stream).c_str());
    GET_TLS();
    e = ihipStreamSynchronize(tls, cb->_stream);

    // Call registered callback function
    cb->_callback(cb->_stream, e, cb->_userData);
    delete cb;
}

//---
// Get the stream to use for a command submission.
//
// If stream==NULL synchronize appropriately with other streams and return the default av for the
// device. If stream is valid, return the AV to use.
hipStream_t ihipSyncAndResolveStream(hipStream_t stream, bool lockAcquired) {
    if (stream == hipStreamNull) {
        // Submitting to NULL stream, call locked_syncDefaultStream to wait for all other streams:
        GET_TLS();
        ihipCtx_t* ctx = ihipGetTlsDefaultCtx();
        tprintf(DB_SYNC, "ihipSyncAndResolveStream %s wait on default stream\n",
                ToString(stream).c_str());

#ifndef HIP_API_PER_THREAD_DEFAULT_STREAM
        ctx->locked_syncDefaultStream(false, false);
#endif
        return ctx->_defaultStream;
    } else {
        // Submitting to a "normal" stream, just wait for null stream:
        if (!(stream->_flags & hipStreamNonBlocking)) {
            if (HIP_SYNC_NULL_STREAM) {
                tprintf(DB_SYNC, "ihipSyncAndResolveStream %s host-wait on default stream\n",
                        ToString(stream).c_str());
                stream->getCtx()->_defaultStream->locked_wait();
            } else {
                ihipStream_t* defaultStream = stream->getCtx()->_defaultStream;


                bool needGatherMarker = false;  // used to gather together other markers.
                hc::completion_future dcf;
                {
                    LockedAccessor_StreamCrit_t defaultStreamCrit(defaultStream->criticalData());
                    // TODO - could call create_blocking_marker(queue) or uses existing marker.
                    if (!defaultStreamCrit->_av.get_is_empty()) {
                        needGatherMarker = true;

                        tprintf(DB_SYNC, "  %s adding marker to default %s for dependency\n",
                                ToString(stream).c_str(), ToString(defaultStream).c_str());
                        dcf = defaultStreamCrit->_av.create_marker(hc::accelerator_scope);
                    } else {
                        tprintf(DB_SYNC, "  %s skipping marker since default stream is empty\n",
                                ToString(stream).c_str());
                    }
                }

                if (needGatherMarker) {
                    // ensure any commands sent to this stream wait on the NULL stream before
                    // continuing
                    if (!lockAcquired) {
                        LockedAccessor_StreamCrit_t thisStreamCrit(stream->criticalData());
                        // TODO - could be "noret" version of create_blocking_marker
                        thisStreamCrit->_av.create_blocking_marker(dcf, hc::accelerator_scope);
                    } else {
                        // this stream is already locked (e.g., call from hipExtLaunchMultiKernelMultiDevice)
                        stream->criticalData()._av.create_blocking_marker(dcf, hc::accelerator_scope);
                    }
                    tprintf(
                        DB_SYNC,
                        "  %s adding marker to wait for freshly recorded default-stream marker \n",
                        ToString(stream).c_str());
                }
            }
        }

        return stream;
    }
}

void ihipPrintKernelLaunch(const char* kernelName, const grid_launch_parm* lp,
                           const hipStream_t stream) {
    if ((HIP_TRACE_API & (1 << TRACE_KCMD)) || HIP_PROFILE_API ||
        (COMPILE_HIP_DB & HIP_TRACE_API)) {
        GET_TLS();
        std::stringstream os;
        os << tls->tidInfo.pid() << " " << tls->tidInfo.tid() << "." << tls->tidInfo.apiSeqNum() << " hipLaunchKernel '"
           << kernelName << "'"
           << " gridDim:" << lp->grid_dim << " groupDim:" << lp->group_dim << " sharedMem:+"
           << lp->dynamic_group_mem_bytes << " " << *stream;

        if (COMPILE_HIP_DB && HIP_TRACE_API) {
            std::string fullStr;
            recordApiTrace(tls, &fullStr, os.str());
        }

        if (HIP_PROFILE_API == 0x1) {
            std::string shortAtpString("hipLaunchKernel:");
            shortAtpString += kernelName;
            MARKER_BEGIN(shortAtpString.c_str(), "HIP");
        } else if (HIP_PROFILE_API == 0x2) {
            MARKER_BEGIN(os.str().c_str(), "HIP");
        }
    }
}

// Called just before a kernel is launched from hipLaunchKernel.
// Allows runtime to track some information about the stream.
hipStream_t ihipPreLaunchKernel(hipStream_t stream, dim3 grid, dim3 block, grid_launch_parm* lp,
                                const char* kernelNameStr, bool lockAcquired) {
    stream = ihipSyncAndResolveStream(stream, lockAcquired);
    lp->grid_dim.x = grid.x;
    lp->grid_dim.y = grid.y;
    lp->grid_dim.z = grid.z;
    lp->group_dim.x = block.x;
    lp->group_dim.y = block.y;
    lp->group_dim.z = block.z;
    lp->barrier_bit = barrier_bit_queue_default;
    lp->launch_fence = -1;

    if (!lockAcquired) {
        auto crit = stream->lockopen_preKernelCommand();
        lp->av = &(crit->_av);
    } else {
        // this stream is already locked (e.g., call from hipExtLaunchMultiKernelMultiDevice)
        lp->av = &(stream->criticalData()._av);
    }
    lp->cf = nullptr;
    ihipPrintKernelLaunch(kernelNameStr, lp, stream);

    return (stream);
}


hipStream_t ihipPreLaunchKernel(hipStream_t stream, size_t grid, dim3 block, grid_launch_parm* lp,
                                const char* kernelNameStr, bool lockAcquired) {
    return ihipPreLaunchKernel(stream, dim3(grid), block, lp, kernelNameStr, lockAcquired);
}


hipStream_t ihipPreLaunchKernel(hipStream_t stream, dim3 grid, size_t block, grid_launch_parm* lp,
                                const char* kernelNameStr, bool lockAcquired) {
    return ihipPreLaunchKernel(stream, grid, dim3(block), lp, kernelNameStr, lockAcquired);
}


hipStream_t ihipPreLaunchKernel(hipStream_t stream, size_t grid, size_t block, grid_launch_parm* lp,
                                const char* kernelNameStr, bool lockAcquired) {
    return ihipPreLaunchKernel(stream, dim3(grid), dim3(block), lp, kernelNameStr, lockAcquired);
}


//---
// Called after kernel finishes execution.
// This releases the lock on the stream.
void ihipPostLaunchKernel(const char* kernelName, hipStream_t stream, grid_launch_parm& lp, bool unlockPostponed) {
    tprintf(DB_SYNC, "ihipPostLaunchKernel, unlocking stream\n");

    stream->lockclose_postKernelCommand(kernelName, lp.av, unlockPostponed);
    if (HIP_PROFILE_API) {
        MARKER_END();
    }
}

//=================================================================================================
// HIP API Implementation
//
// Implementor notes:
// _ All functions should call HIP_INIT_API as first action:
//    HIP_INIT_API(<function_arguments>);
//
// - ALl functions should use ihipLogStatus to return error code (not return error directly).
//=================================================================================================
//
//---

//-------------------------------------------------------------------------------------------------


const char* ihipErrorString(hipError_t hip_error) {
    switch (hip_error) {
        case hipSuccess:
            return "hipSuccess";
        case hipErrorOutOfMemory:
            return "hipErrorOutOfMemory";
        case hipErrorNotInitialized:
            return "hipErrorNotInitialized";
        case hipErrorDeinitialized:
            return "hipErrorDeinitialized";
        case hipErrorProfilerDisabled:
            return "hipErrorProfilerDisabled";
        case hipErrorProfilerNotInitialized:
            return "hipErrorProfilerNotInitialized";
        case hipErrorProfilerAlreadyStarted:
            return "hipErrorProfilerAlreadyStarted";
        case hipErrorProfilerAlreadyStopped:
            return "hipErrorProfilerAlreadyStopped";
         case hipErrorInsufficientDriver:
            return "hipErrorInsufficientDriver";
        case hipErrorInvalidImage:
            return "hipErrorInvalidImage";
        case hipErrorInvalidContext:
            return "hipErrorInvalidContext";
        case hipErrorContextAlreadyCurrent:
            return "hipErrorContextAlreadyCurrent";
        case hipErrorMapFailed:
            return "hipErrorMapFailed";
        case hipErrorUnmapFailed:
            return "hipErrorUnmapFailed";
        case hipErrorArrayIsMapped:
            return "hipErrorArrayIsMapped";
        case hipErrorAlreadyMapped:
            return "hipErrorAlreadyMapped";
        case hipErrorNoBinaryForGpu:
            return "hipErrorNoBinaryForGpu";
        case hipErrorAlreadyAcquired:
            return "hipErrorAlreadyAcquired";
        case hipErrorNotMapped:
            return "hipErrorNotMapped";
        case hipErrorNotMappedAsArray:
            return "hipErrorNotMappedAsArray";
        case hipErrorNotMappedAsPointer:
            return "hipErrorNotMappedAsPointer";
        case hipErrorECCNotCorrectable:
            return "hipErrorECCNotCorrectable";
        case hipErrorUnsupportedLimit:
            return "hipErrorUnsupportedLimit";
        case hipErrorContextAlreadyInUse:
            return "hipErrorContextAlreadyInUse";
        case hipErrorPeerAccessUnsupported:
            return "hipErrorPeerAccessUnsupported";
        case hipErrorInvalidKernelFile:
            return "hipErrorInvalidKernelFile";
        case hipErrorInvalidGraphicsContext:
            return "hipErrorInvalidGraphicsContext";
        case hipErrorInvalidSource:
            return "hipErrorInvalidSource";
        case hipErrorFileNotFound:
            return "hipErrorFileNotFound";
        case hipErrorSharedObjectSymbolNotFound:
            return "hipErrorSharedObjectSymbolNotFound";
        case hipErrorSharedObjectInitFailed:
            return "hipErrorSharedObjectInitFailed";
        case hipErrorOperatingSystem:
            return "hipErrorOperatingSystem";
        case hipErrorSetOnActiveProcess:
            return "hipErrorSetOnActiveProcess";
        case hipErrorInvalidHandle:
            return "hipErrorInvalidHandle";
        case hipErrorNotFound:
            return "hipErrorNotFound";
        case hipErrorIllegalAddress:
            return "hipErrorIllegalAddress";
        case hipErrorInvalidSymbol:
            return "hipErrorInvalidSymbol";

        case hipErrorMissingConfiguration:
            return "hipErrorMissingConfiguration";
        case hipErrorMemoryAllocation:
            return "hipErrorMemoryAllocation";
        case hipErrorInitializationError:
            return "hipErrorInitializationError";
        case hipErrorLaunchFailure:
            return "hipErrorLaunchFailure";
        case hipErrorPriorLaunchFailure:
            return "hipErrorPriorLaunchFailure";
        case hipErrorLaunchTimeOut:
            return "hipErrorLaunchTimeOut";
        case hipErrorLaunchOutOfResources:
            return "hipErrorLaunchOutOfResources";
        case hipErrorInvalidDeviceFunction:
            return "hipErrorInvalidDeviceFunction";
        case hipErrorInvalidConfiguration:
            return "hipErrorInvalidConfiguration";
        case hipErrorInvalidDevice:
            return "hipErrorInvalidDevice";
        case hipErrorInvalidValue:
            return "hipErrorInvalidValue";
        case hipErrorInvalidDevicePointer:
            return "hipErrorInvalidDevicePointer";
        case hipErrorInvalidMemcpyDirection:
            return "hipErrorInvalidMemcpyDirection";
        case hipErrorUnknown:
            return "hipErrorUnknown";
        case hipErrorInvalidResourceHandle:
            return "hipErrorInvalidResourceHandle";
        case hipErrorNotReady:
            return "hipErrorNotReady";
        case hipErrorNoDevice:
            return "hipErrorNoDevice";
        case hipErrorPeerAccessAlreadyEnabled:
            return "hipErrorPeerAccessAlreadyEnabled";

        case hipErrorPeerAccessNotEnabled:
            return "hipErrorPeerAccessNotEnabled";
        case hipErrorRuntimeMemory:
            return "hipErrorRuntimeMemory";
        case hipErrorRuntimeOther:
            return "hipErrorRuntimeOther";
        case hipErrorHostMemoryAlreadyRegistered:
            return "hipErrorHostMemoryAlreadyRegistered";
        case hipErrorHostMemoryNotRegistered:
            return "hipErrorHostMemoryNotRegistered";
        case hipErrorMapBufferObjectFailed:
            return "hipErrorMapBufferObjectFailed";
        case hipErrorAssert:
            return "hipErrorAssert";
        case hipErrorNotSupported:
            return "hipErrorNotSupported";
        case hipErrorTbd:
            return "hipErrorTbd";
        default:
            return "hipErrorUnknown";
    };
};


// Returns true if copyEngineCtx can see the memory allocated on dstCtx and srcCtx.
// The peer-list for a context controls which contexts have access to the memory allocated on that
// context. So we check dstCtx's and srcCtx's peerList to see if the both include thisCtx.
bool ihipStream_t::canSeeMemory(const ihipCtx_t* copyEngineCtx, const hc::AmPointerInfo* dstPtrInfo,
                                const hc::AmPointerInfo* srcPtrInfo) {
    if (copyEngineCtx == nullptr) {
        return false;
    }

    // Make sure this is a device-to-device copy with all memory available to the requested copy
    // engine
    //
    // TODO - pointer-info stores a deviceID not a context,may have some unusual side-effects here:
    if (dstPtrInfo->_sizeBytes == 0) {
        return false;
    } else if (dstPtrInfo->_appId != -1) {
#if USE_APP_PTR_FOR_CTX
        ihipCtx_t* dstCtx = static_cast<ihipCtx_t*>(dstPtrInfo->_appPtr);
#else
        ihipCtx_t* dstCtx = ihipGetPrimaryCtx(dstPtrInfo->_appId);
#endif
        if (copyEngineCtx != dstCtx) {
            // Only checks peer list if contexts are different
            LockedAccessor_CtxCrit_t ctxCrit(dstCtx->criticalData());
#if DB_PEER_CTX
            std::cerr << "checking peer : copyEngineCtx =" << copyEngineCtx << " dstCtx =" << dstCtx
                      << " peerCnt=" << ctxCrit->peerCnt() << "\n";
#endif
            if (!ctxCrit->isPeerWatcher(copyEngineCtx)) {
                return false;
            };
        }
    }


    // TODO - pointer-info stores a deviceID not a context,may have some unusual side-effects here:
    if (srcPtrInfo->_sizeBytes == 0) {
        return false;
    } else if (srcPtrInfo->_appId != -1) {
#if USE_APP_PTR_FOR_CTX
        ihipCtx_t* srcCtx = static_cast<ihipCtx_t*>(srcPtrInfo->_appPtr);
#else
        ihipCtx_t* srcCtx = ihipGetPrimaryCtx(srcPtrInfo->_appId);
#endif
        if (copyEngineCtx != srcCtx) {
            // Only checks peer list if contexts are different
            LockedAccessor_CtxCrit_t ctxCrit(srcCtx->criticalData());
#if DB_PEER_CTX
            std::cerr << "checking peer : copyEngineCtx =" << copyEngineCtx << " srcCtx =" << srcCtx
                      << " peerCnt=" << ctxCrit->peerCnt() << "\n";
#endif
            if (!ctxCrit->isPeerWatcher(copyEngineCtx)) {
                return false;
            };
        }
    }

    return true;
};


#define CASE_STRING(X)                                                                             \
    case X:                                                                                        \
        return #X;                                                                                 \
        break;

const char* hipMemcpyStr(unsigned memKind) {
    switch (memKind) {
        CASE_STRING(hipMemcpyHostToHost);
        CASE_STRING(hipMemcpyHostToDevice);
        CASE_STRING(hipMemcpyDeviceToHost);
        CASE_STRING(hipMemcpyDeviceToDevice);
        CASE_STRING(hipMemcpyDefault);
        default:
            return ("unknown memcpyKind");
    };
}

const char* hcMemcpyStr(hc::hcCommandKind memKind) {
    using namespace hc;
    switch (memKind) {
        CASE_STRING(hcMemcpyHostToHost);
        CASE_STRING(hcMemcpyHostToDevice);
        CASE_STRING(hcMemcpyDeviceToHost);
        CASE_STRING(hcMemcpyDeviceToDevice);
        // CASE_STRING(hcMemcpyDefault);
        default:
            return ("unknown memcpyKind");
    };
}


// Resolve hipMemcpyDefault to a known type.
unsigned ihipStream_t::resolveMemcpyDirection(bool srcInDeviceMem, bool dstInDeviceMem) {
    hipMemcpyKind kind = hipMemcpyDefault;

    if (srcInDeviceMem && dstInDeviceMem) {
        kind = hipMemcpyDeviceToDevice;
    }
    if (srcInDeviceMem && !dstInDeviceMem) {
        kind = hipMemcpyDeviceToHost;
    }
    if (!srcInDeviceMem && !dstInDeviceMem) {
        kind = hipMemcpyHostToHost;
    }
    if (!srcInDeviceMem && dstInDeviceMem) {
        kind = hipMemcpyHostToDevice;
    }

    assert(kind != hipMemcpyDefault);
    return kind;
}


// hipMemKind must be "resolved" to a specific direction - cannot be default.
void ihipStream_t::resolveHcMemcpyDirection(unsigned hipMemKind,
                                            const hc::AmPointerInfo* dstPtrInfo,
                                            const hc::AmPointerInfo* srcPtrInfo,
                                            hc::hcCommandKind* hcCopyDir, ihipCtx_t** copyDevice,
                                            bool* forceUnpinnedCopy) {
    // Ignore what the user tells us and always resolve the direction:
    // Some apps apparently rely on this.
    hipMemKind = resolveMemcpyDirection(srcPtrInfo->_isInDeviceMem, dstPtrInfo->_isInDeviceMem);


    switch (hipMemKind) {
        case hipMemcpyHostToHost:
            *hcCopyDir = hc::hcMemcpyHostToHost;
            break;
        case hipMemcpyHostToDevice:
            *hcCopyDir = hc::hcMemcpyHostToDevice;
            break;
        case hipMemcpyDeviceToHost:
            *hcCopyDir = hc::hcMemcpyDeviceToHost;
            break;
        case hipMemcpyDeviceToDevice:
            *hcCopyDir = hc::hcMemcpyDeviceToDevice;
            break;
        default:
            throw ihipException(hipErrorRuntimeOther);
    };

    if (srcPtrInfo->_isInDeviceMem) {
        *copyDevice = ihipGetPrimaryCtx(srcPtrInfo->_appId);
    } else if (dstPtrInfo->_isInDeviceMem) {
        *copyDevice = ihipGetPrimaryCtx(dstPtrInfo->_appId);
    } else {
        *copyDevice = nullptr;
    }

    *forceUnpinnedCopy = false;
    if (canSeeMemory(*copyDevice, dstPtrInfo, srcPtrInfo)) {
        if (HIP_FORCE_P2P_HOST & 0x1) {
            *forceUnpinnedCopy = true;
            tprintf(DB_COPY,
                    "Copy engine (dev:%d agent=0x%lx) can see src and dst but "
                    "HIP_FORCE_P2P_HOST=0, forcing copy through staging buffers.\n",
                    *copyDevice ? (*copyDevice)->getDeviceNum() : -1,
                    *copyDevice ? (*copyDevice)->getDevice()->_hsaAgent.handle : 0x0);

        } else {
            tprintf(DB_COPY, "Copy engine (dev:%d agent=0x%lx) can see src and dst.\n",
                    *copyDevice ? (*copyDevice)->getDeviceNum() : -1,
                    *copyDevice ? (*copyDevice)->getDevice()->_hsaAgent.handle : 0x0);
        }
    } else {
        *forceUnpinnedCopy = true;
        tprintf(DB_COPY,
                "Copy engine(dev:%d agent=0x%lx) cannot see both host and device pointers - "
                "forcing copy with unpinned engine.\n",
                *copyDevice ? (*copyDevice)->getDeviceNum() : -1,
                *copyDevice ? (*copyDevice)->getDevice()->_hsaAgent.handle : 0x0);
        if (HIP_FAIL_SOC & 0x2) {
            fprintf(stderr,
                    "HIP_FAIL_SOC:  P2P: copy engine(dev:%d agent=0x%lx) cannot see both host and "
                    "device pointers - forcing copy with unpinned engine.\n",
                    *copyDevice ? (*copyDevice)->getDeviceNum() : -1,
                    *copyDevice ? (*copyDevice)->getDevice()->_hsaAgent.handle : 0x0);
            throw ihipException(hipErrorRuntimeOther);
        }
    }
}


void printPointerInfo(unsigned dbFlag, const char* tag, const void* ptr,
                      const hc::AmPointerInfo& ptrInfo) {
    tprintf(dbFlag,
            "  %s=%p baseHost=%p baseDev=%p sz=%zu home_dev=%d tracked=%d isDevMem=%d "
            "registered=%d allocSeqNum=%zu, appAllocationFlags=%x, appPtr=%p\n",
            tag, ptr, ptrInfo._hostPointer, ptrInfo._devicePointer, ptrInfo._sizeBytes,
            ptrInfo._appId, ptrInfo._sizeBytes != 0, ptrInfo._isInDeviceMem, !ptrInfo._isAmManaged,
            ptrInfo._allocSeqNum, ptrInfo._appAllocationFlags, ptrInfo._appPtr);
}


// the pointer-info as returned by HC refers to the allocation
// This routine modifies the pointer-info so it appears to refer to the specific ptr and sizeBytes.
// TODO -remove this when HCC uses HSA pointer info functions directly.
void tailorPtrInfo(hc::AmPointerInfo* ptrInfo, const void* ptr, size_t sizeBytes) {
    const char* ptrc = static_cast<const char*>(ptr);
    if (ptrInfo->_sizeBytes == 0) {
        // invalid ptrInfo, don't modify
        return;
    } else if (ptrInfo->_isInDeviceMem) {
        assert(ptrInfo->_devicePointer != nullptr);
        std::ptrdiff_t diff = ptrc - static_cast<const char*>(ptrInfo->_devicePointer);

        // TODO : assert-> runtime assert that only appears in debug mode
        assert(diff >= 0);
        assert(diff <= ptrInfo->_sizeBytes);

        ptrInfo->_devicePointer = const_cast<void*>(ptr);

        if (ptrInfo->_hostPointer != nullptr) {
            ptrInfo->_hostPointer = static_cast<char*>(ptrInfo->_hostPointer) + diff;
        }

    } else {
        assert(ptrInfo->_hostPointer != nullptr);
        std::ptrdiff_t diff = ptrc - static_cast<const char*>(ptrInfo->_hostPointer);

        // TODO : assert-> runtime assert that only appears in debug mode
        assert(diff >= 0);
        assert(diff <= ptrInfo->_sizeBytes);

        ptrInfo->_hostPointer = const_cast<void*>(ptr);

        if (ptrInfo->_devicePointer != nullptr) {
            ptrInfo->_devicePointer = static_cast<char*>(ptrInfo->_devicePointer) + diff;
        }
    }

    assert(sizeBytes <= ptrInfo->_sizeBytes);
    ptrInfo->_sizeBytes = sizeBytes;
};


bool getTailoredPtrInfo(const char* tag, hc::AmPointerInfo* ptrInfo, const void* ptr,
                        size_t sizeBytes) {
    bool tracked = (hc::am_memtracker_getinfo(ptrInfo, ptr) == AM_SUCCESS);
    printPointerInfo(DB_COPY, tag, ptr, *ptrInfo);

    if (tracked) {
        tailorPtrInfo(ptrInfo, ptr, sizeBytes);
        printPointerInfo(DB_COPY, "    mod", ptr, *ptrInfo);
    }

    return tracked;
};


// TODO : For registered and host memory, if the portable flag is set, we need to recognize that and
// perform appropriate copy operation. What can happen now is that Portable memory is mapped into
// multiple devices but Peer access is not enabled. i The peer detection logic doesn't see that the
// memory is already mapped and so tries to use an unpinned copy algorithm.  If this is PinInPlace,
// then an error can occur. Need to track Portable flag correctly or use new RT functionality to
// query the peer status for the pointer.
//
// TODO - remove kind parm from here or use it below?
void ihipStream_t::locked_copySync(void* dst, const void* src, size_t sizeBytes, unsigned kind,
                                   bool resolveOn) {
    ihipCtx_t* ctx = this->getCtx();
    const ihipDevice_t* device = ctx->getDevice();

    if (device == NULL) {
        throw ihipException(hipErrorInvalidDevice);
    }

    hc::accelerator acc;
#if (__hcc_workweek__ >= 17332)
    hc::AmPointerInfo dstPtrInfo(NULL, NULL, NULL, 0, acc, 0, 0);
    hc::AmPointerInfo srcPtrInfo(NULL, NULL, NULL, 0, acc, 0, 0);
#else
    hc::AmPointerInfo dstPtrInfo(NULL, NULL, 0, acc, 0, 0);
    hc::AmPointerInfo srcPtrInfo(NULL, NULL, 0, acc, 0, 0);
#endif
    bool dstTracked = getTailoredPtrInfo("    dst", &dstPtrInfo, dst, sizeBytes);
    bool srcTracked = getTailoredPtrInfo("    src", &srcPtrInfo, src, sizeBytes);


    // Some code in HCC and in printPointerInfo uses _sizeBytes==0 as an indication ptr is not
    // valid, so check it here:
    if (!dstTracked) {
        assert(dstPtrInfo._sizeBytes == 0);
    }
    if (!srcTracked) {
        assert(srcPtrInfo._sizeBytes == 0);
    }


    hc::hcCommandKind hcCopyDir;
    ihipCtx_t* copyDevice;
    bool forceUnpinnedCopy;
    resolveHcMemcpyDirection(kind, &dstPtrInfo, &srcPtrInfo, &hcCopyDir, &copyDevice,
                             &forceUnpinnedCopy);

    {
        LockedAccessor_StreamCrit_t crit(_criticalData);
        tprintf(DB_COPY,
                "copySync copyDev:%d  dst=%p (phys_dev:%d, isDevMem:%d)  src=%p(phys_dev:%d, "
                "isDevMem:%d)   sz=%zu dir=%s forceUnpinnedCopy=%d\n",
                copyDevice ? copyDevice->getDeviceNum() : -1, dst, dstPtrInfo._appId,
                dstPtrInfo._isInDeviceMem, src, srcPtrInfo._appId, srcPtrInfo._isInDeviceMem,
                sizeBytes, hcMemcpyStr(hcCopyDir), forceUnpinnedCopy);
        printPointerInfo(DB_COPY, "  dst", dst, dstPtrInfo);
        printPointerInfo(DB_COPY, "  src", src, srcPtrInfo);


        crit->_av.copy_ext(src, dst, sizeBytes, hcCopyDir, srcPtrInfo, dstPtrInfo,
                           copyDevice ? &copyDevice->getDevice()->_acc : nullptr,
                           forceUnpinnedCopy);
    }
}

bool ihipStream_t::locked_copy2DSync(void* dst, const void* src, size_t width, size_t height, size_t srcPitch, size_t dstPitch, unsigned kind,
                                   bool resolveOn) {
    bool retStatus = true;
    ihipCtx_t* ctx = this->getCtx();
    const ihipDevice_t* device = ctx->getDevice();

    if (device == NULL) {
        throw ihipException(hipErrorInvalidDevice);
    }
    size_t sizeBytes = width*height;
    hc::accelerator acc;
#if (__hcc_workweek__ >= 17332)
    hc::AmPointerInfo dstPtrInfo(NULL, NULL, NULL, 0, acc, 0, 0);
    hc::AmPointerInfo srcPtrInfo(NULL, NULL, NULL, 0, acc, 0, 0);
#else
    hc::AmPointerInfo dstPtrInfo(NULL, NULL, 0, acc, 0, 0);
    hc::AmPointerInfo srcPtrInfo(NULL, NULL, 0, acc, 0, 0);
#endif
    bool dstTracked = getTailoredPtrInfo("    dst", &dstPtrInfo, dst, sizeBytes);
    bool srcTracked = getTailoredPtrInfo("    src", &srcPtrInfo, src, sizeBytes);

    // Some code in HCC and in printPointerInfo uses _sizeBytes==0 as an indication ptr is not
    //     // valid, so check it here:
    if (!dstTracked) {
        assert(dstPtrInfo._sizeBytes == 0);
    }
    if (!srcTracked) {
        assert(srcPtrInfo._sizeBytes == 0);
    }


    hc::hcCommandKind hcCopyDir;
    ihipCtx_t* copyDevice;
    bool forceUnpinnedCopy;
    resolveHcMemcpyDirection(kind, &dstPtrInfo, &srcPtrInfo, &hcCopyDir, &copyDevice,
                             &forceUnpinnedCopy);

    {
        LockedAccessor_StreamCrit_t crit(_criticalData);
        tprintf(DB_COPY,
                "copy2DSync copyDev:%d  dst=%p (phys_dev:%d, isDevMem:%d)  src=%p(phys_dev:%d, "
                "isDevMem:%d)   sz=%zu dir=%s forceUnpinnedCopy=%d\n",
                copyDevice ? copyDevice->getDeviceNum() : -1, dst, dstPtrInfo._appId,
                dstPtrInfo._isInDeviceMem, src, srcPtrInfo._appId, srcPtrInfo._isInDeviceMem,
                sizeBytes, hcMemcpyStr(hcCopyDir), forceUnpinnedCopy);
        printPointerInfo(DB_COPY, "  dst", dst, dstPtrInfo);
        printPointerInfo(DB_COPY, "  src", src, srcPtrInfo);
#if (__hcc_workweek__ >= 19101)
        if(!crit->_av.copy2d_ext(src, dst, width, height, srcPitch, dstPitch, hcCopyDir, srcPtrInfo, dstPtrInfo,
                           copyDevice ? &copyDevice->getDevice()->_acc : nullptr,
                           forceUnpinnedCopy)) {
            tprintf(DB_COPY,"locked_copy2DSync failed to use SDMA\n");
            retStatus = false;
        }
#else
        crit->_av.copy2d_ext(src, dst, width, height, srcPitch, dstPitch, hcCopyDir, srcPtrInfo, dstPtrInfo,
                           copyDevice ? &copyDevice->getDevice()->_acc : nullptr,
                           forceUnpinnedCopy);
#endif
    }
    return retStatus;
}

void ihipStream_t::addSymbolPtrToTracker(hc::accelerator& acc, void* ptr, size_t sizeBytes) {
#if (__hcc_workweek__ >= 17332)
    hc::AmPointerInfo ptrInfo(NULL, ptr, ptr, sizeBytes, acc, true, false);
#else
    hc::AmPointerInfo ptrInfo(NULL, ptr, sizeBytes, acc, true, false);
#endif
    hc::am_memtracker_add(ptr, ptrInfo);
}

void ihipStream_t::lockedSymbolCopySync(hc::accelerator& acc, void* dst, void* src,
                                        size_t sizeBytes, size_t offset, unsigned kind) {
    if (kind == hipMemcpyHostToHost) {
        acc.memcpy_symbol(dst, (void*)src, sizeBytes, offset, Kalmar::hcMemcpyHostToHost);
    }
    if (kind == hipMemcpyHostToDevice) {
        acc.memcpy_symbol(dst, (void*)src, sizeBytes, offset);
    }
    if (kind == hipMemcpyDeviceToDevice) {
        acc.memcpy_symbol(dst, (void*)src, sizeBytes, offset, Kalmar::hcMemcpyDeviceToDevice);
    }
    if (kind == hipMemcpyDeviceToHost) {
        acc.memcpy_symbol((void*)src, (void*)dst, sizeBytes, offset, Kalmar::hcMemcpyDeviceToHost);
    }
}

void ihipStream_t::lockedSymbolCopyAsync(hc::accelerator& acc, void* dst, void* src,
                                         size_t sizeBytes, size_t offset, unsigned kind) {
    // TODO - review - this looks broken , should not be adding pointers to tracker dynamically:
    if (kind == hipMemcpyHostToDevice) {
#if (__hcc_workweek__ >= 17332)
        hc::AmPointerInfo srcPtrInfo(NULL, NULL, NULL, 0, acc, 0, 0);
#else
        hc::AmPointerInfo srcPtrInfo(NULL, NULL, 0, acc, 0, 0);
#endif
        bool srcTracked = (hc::am_memtracker_getinfo(&srcPtrInfo, src) == AM_SUCCESS);
        if (srcTracked) {
            addSymbolPtrToTracker(acc, dst, sizeBytes);
            locked_getAv()->copy_async((void*)src, dst, sizeBytes);
        } else {
            LockedAccessor_StreamCrit_t crit(_criticalData);
            this->wait(crit);
            acc.memcpy_symbol(dst, (void*)src, sizeBytes, offset);
        }
    }
    if (kind == hipMemcpyDeviceToHost) {
#if (__hcc_workweek__ >= 17332)
        hc::AmPointerInfo dstPtrInfo(NULL, NULL, NULL, 0, acc, 0, 0);
#else
        hc::AmPointerInfo dstPtrInfo(NULL, NULL, 0, acc, 0, 0);
#endif
        bool dstTracked = (hc::am_memtracker_getinfo(&dstPtrInfo, dst) == AM_SUCCESS);
        if (dstTracked) {
            addSymbolPtrToTracker(acc, src, sizeBytes);
            locked_getAv()->copy_async((void*)src, dst, sizeBytes);
        } else {
            LockedAccessor_StreamCrit_t crit(_criticalData);
            this->wait(crit);
            acc.memcpy_symbol((void*)src, (void*)dst, sizeBytes, offset,
                              Kalmar::hcMemcpyDeviceToHost);
        }
    }
}


void ihipStream_t::locked_copyAsync(void* dst, const void* src, size_t sizeBytes, unsigned kind) {
    const ihipCtx_t* ctx = this->getCtx();

    if ((ctx == nullptr) || (ctx->getDevice() == nullptr)) {
        tprintf(DB_COPY, "locked_copyAsync bad ctx or device\n");
        throw ihipException(hipErrorInvalidDevice);
    }

    if (kind == hipMemcpyHostToHost) {
        tprintf(DB_COPY, "locked_copyAsync: H2H with memcpy");

        // TODO - consider if we want to perhaps use the GPU SDMA engines anyway, to avoid the
        // host-side sync here and keep everything flowing on the GPU.
        /* As this is a CPU op, we need to wait until all
        the commands in current stream are finished.
        */
        LockedAccessor_StreamCrit_t crit(_criticalData);
        this->wait(crit);

        memcpy(dst, src, sizeBytes);

    } else {
        hc::accelerator acc;
#if (__hcc_workweek__ >= 17332)
        hc::AmPointerInfo dstPtrInfo(NULL, NULL, NULL, 0, acc, 0, 0);
        hc::AmPointerInfo srcPtrInfo(NULL, NULL, NULL, 0, acc, 0, 0);
#else
        hc::AmPointerInfo dstPtrInfo(NULL, NULL, 0, acc, 0, 0);
        hc::AmPointerInfo srcPtrInfo(NULL, NULL, 0, acc, 0, 0);
#endif
        tprintf(DB_COPY, "copyASync dst=%p src=%p, sz=%zu\n", dst, src, sizeBytes);
        bool dstTracked = getTailoredPtrInfo("    dst", &dstPtrInfo, dst, sizeBytes);
        bool srcTracked = getTailoredPtrInfo("    src", &srcPtrInfo, src, sizeBytes);


        hc::hcCommandKind hcCopyDir;
        ihipCtx_t* copyDevice;
        bool forceUnpinnedCopy;
        resolveHcMemcpyDirection(kind, &dstPtrInfo, &srcPtrInfo, &hcCopyDir, &copyDevice,
                                 &forceUnpinnedCopy);
        tprintf(DB_COPY, "  copyDev:%d   dir=%s forceUnpinnedCopy=%d\n",
                copyDevice ? copyDevice->getDeviceNum() : -1, hcMemcpyStr(hcCopyDir),
                forceUnpinnedCopy);

        // "tracked" really indicates if the pointer's virtual address is available in the GPU
        // address space. If both pointers are not tracked, we need to fall back to a sync copy.
        if (dstTracked && srcTracked && !forceUnpinnedCopy &&
            copyDevice /*code below assumes this is !nullptr*/) {
            LockedAccessor_StreamCrit_t crit(_criticalData);

            // Perform fast asynchronous copy - we know copyDevice != NULL based on check above
            try {
                if (HIP_FORCE_SYNC_COPY) {
                    crit->_av.copy_ext(src, dst, sizeBytes, hcCopyDir, srcPtrInfo, dstPtrInfo,
                                       &copyDevice->getDevice()->_acc, forceUnpinnedCopy);

                } else {
                    crit->_av.copy_async_ext(src, dst, sizeBytes, hcCopyDir, srcPtrInfo, dstPtrInfo,
                                             &copyDevice->getDevice()->_acc);
                }
            } catch (Kalmar::runtime_exception) {
                throw ihipException(hipErrorRuntimeOther);
            };


            if (HIP_API_BLOCKING) {
                tprintf(DB_SYNC, "%s LAUNCH_BLOCKING for completion of hipMemcpyAsync(sz=%zu)\n",
                        ToString(this).c_str(), sizeBytes);
                this->wait(crit);
            }

        } else {
            if (HIP_FAIL_SOC & 0x1) {
                fprintf(stderr,
                        "HIP_FAIL_SOC failed, async_copy requested but could not be completed "
                        "since src or dst not accesible to copy agent\n");
                fprintf(stderr,
                        "copyASync copyDev:%d  dst=%p (phys_dev:%d, isDevMem:%d)  "
                        "src=%p(phys_dev:%d, isDevMem:%d)   sz=%zu dir=%s forceUnpinnedCopy=%d\n",
                        copyDevice ? copyDevice->getDeviceNum() : -1, dst, dstPtrInfo._appId,
                        dstPtrInfo._isInDeviceMem, src, srcPtrInfo._appId,
                        srcPtrInfo._isInDeviceMem, sizeBytes, hcMemcpyStr(hcCopyDir),
                        forceUnpinnedCopy);
                fprintf(
                    stderr,
                    "  dst=%p baseHost=%p baseDev=%p sz=%zu home_dev=%d tracked=%d isDevMem=%d\n",
                    dst, dstPtrInfo._hostPointer, dstPtrInfo._devicePointer, dstPtrInfo._sizeBytes,
                    dstPtrInfo._appId, dstTracked, dstPtrInfo._isInDeviceMem);
                fprintf(
                    stderr,
                    "  src=%p baseHost=%p baseDev=%p sz=%zu home_dev=%d tracked=%d isDevMem=%d\n",
                    src, srcPtrInfo._hostPointer, srcPtrInfo._devicePointer, srcPtrInfo._sizeBytes,
                    srcPtrInfo._appId, srcTracked, srcPtrInfo._isInDeviceMem);
                throw ihipException(hipErrorRuntimeOther);
            }
            // Perform slow synchronous copy:
            LockedAccessor_StreamCrit_t crit(_criticalData);


            crit->_av.copy_ext(src, dst, sizeBytes, hcCopyDir, srcPtrInfo, dstPtrInfo,
                               copyDevice ? &copyDevice->getDevice()->_acc : nullptr,
                               forceUnpinnedCopy);
        }
    }
}

bool ihipStream_t::locked_copy2DAsync(void* dst, const void* src, size_t width, size_t height, size_t srcPitch, size_t dstPitch, unsigned kind)
{
    bool retStatus = true;
    const ihipCtx_t* ctx = this->getCtx();

    if ((ctx == nullptr) || (ctx->getDevice() == nullptr)) {
        tprintf(DB_COPY, "locked_copy2DAsync bad ctx or device\n");
        throw ihipException(hipErrorInvalidDevice);
    }
    hc::accelerator acc;
    size_t sizeBytes = width*height;
#if (__hcc_workweek__ >= 17332)
    hc::AmPointerInfo dstPtrInfo(NULL, NULL, NULL, 0, acc, 0, 0);
    hc::AmPointerInfo srcPtrInfo(NULL, NULL, NULL, 0, acc, 0, 0);
#else
    hc::AmPointerInfo dstPtrInfo(NULL, NULL, 0, acc, 0, 0);
    hc::AmPointerInfo srcPtrInfo(NULL, NULL, 0, acc, 0, 0);
#endif
    tprintf(DB_COPY, "copy2DAsync dst=%p src=%p, sz=%zu\n", dst, src, sizeBytes);
    bool dstTracked = getTailoredPtrInfo("    dst", &dstPtrInfo, dst, sizeBytes);
    bool srcTracked = getTailoredPtrInfo("    src", &srcPtrInfo, src, sizeBytes);


    hc::hcCommandKind hcCopyDir;
    ihipCtx_t* copyDevice;
    bool forceUnpinnedCopy;
    resolveHcMemcpyDirection(kind, &dstPtrInfo, &srcPtrInfo, &hcCopyDir, &copyDevice,
                             &forceUnpinnedCopy);
    tprintf(DB_COPY, "  copyDev:%d   dir=%s forceUnpinnedCopy=%d\n",
            copyDevice ? copyDevice->getDeviceNum() : -1, hcMemcpyStr(hcCopyDir),
               forceUnpinnedCopy);
    if (dstTracked && srcTracked && !forceUnpinnedCopy &&
        copyDevice /*code below assumes this is !nullptr*/) {
        LockedAccessor_StreamCrit_t crit(_criticalData);

        try {
             if (HIP_FORCE_SYNC_COPY) {
#if (__hcc_workweek__ >= 19101)
                 if(!crit->_av.copy2d_ext(src, dst, width, height, srcPitch, dstPitch, hcCopyDir, srcPtrInfo, dstPtrInfo,
                           &copyDevice->getDevice()->_acc,
                           forceUnpinnedCopy)){
                     tprintf(DB_COPY,"locked_copy2DASync with HIP_FORCE_SYNC_COPY failed to use SDMA\n");
                     retStatus = false;
                 }
#else
                 crit->_av.copy2d_ext(src, dst, width, height, srcPitch, dstPitch, hcCopyDir, srcPtrInfo, dstPtrInfo,
                           &copyDevice->getDevice()->_acc,
                           forceUnpinnedCopy);
#endif

             } else {
                 const auto& future = crit->_av.copy2d_async_ext(src, dst, width, height, srcPitch, dstPitch, hcCopyDir, srcPtrInfo, dstPtrInfo,
                                          &copyDevice->getDevice()->_acc);
                 if(!future.valid()) {
                     tprintf(DB_COPY, "locked_copy2DAsync failed to use SDMA\n");
                     retStatus = false;
                 }
             }
         } catch (Kalmar::runtime_exception) {
                throw ihipException(hipErrorRuntimeOther);
         };

         if (HIP_API_BLOCKING) {
             tprintf(DB_SYNC, "%s LAUNCH_BLOCKING for completion of hipMemcpy2DAsync(sz=%zu)\n",
                        ToString(this).c_str(), sizeBytes);
             this->wait(crit);
         }

    } else {
         //Do sync 2D copy
         LockedAccessor_StreamCrit_t crit(_criticalData);
#if (__hcc_workweek__ >= 19101)
         if(!crit->_av.copy2d_ext(src, dst, width, height, srcPitch, dstPitch, hcCopyDir, srcPtrInfo, dstPtrInfo,
                           copyDevice ? &copyDevice->getDevice()->_acc : nullptr,
                           forceUnpinnedCopy)){
             tprintf(DB_COPY, "locked_copy2DAsync Sync copy failed to use SDMA\n");
             retStatus = false;
         }
#else
         crit->_av.copy2d_ext(src, dst, width, height, srcPitch, dstPitch, hcCopyDir, srcPtrInfo, dstPtrInfo,
                           copyDevice ? &copyDevice->getDevice()->_acc : nullptr,
                           forceUnpinnedCopy);
#endif
    }
    return retStatus;
}

//-------------------------------------------------------------------------------------------------
//-------------------------------------------------------------------------------------------------
// Profiler, really these should live elsewhere:
hipError_t hipProfilerStart() {
    HIP_INIT_API(hipProfilerStart);
#if COMPILE_HIP_ATP_MARKER
    amdtResumeProfiling(AMDT_ALL_PROFILING);
#endif

    return ihipLogStatus(hipSuccess);
};


hipError_t hipProfilerStop() {
    HIP_INIT_API(hipProfilerStop);
#if COMPILE_HIP_ATP_MARKER
    amdtStopProfiling(AMDT_ALL_PROFILING);
#endif

    return ihipLogStatus(hipSuccess);
};


//-------------------------------------------------------------------------------------------------
//-------------------------------------------------------------------------------------------------
// HCC-specific accessor functions:

//---
hipError_t hipHccGetAccelerator(int deviceId, hc::accelerator* acc) {
    HIP_INIT_API(hipHccGetAccelerator, deviceId, acc);

    const ihipDevice_t* device = ihipGetDevice(deviceId);
    hipError_t err;
    if (device == NULL) {
        err = hipErrorInvalidDevice;
    } else {
        *acc = device->_acc;
        err = hipSuccess;
    }
    return ihipLogStatus(err);
}


//---
hipError_t hipHccGetAcceleratorView(hipStream_t stream, hc::accelerator_view** av) {
    HIP_INIT_API(hipHccGetAcceleratorView, stream, av);

    if (stream == hipStreamNull) {
        ihipCtx_t* device = ihipGetTlsDefaultCtx();
        stream = device->_defaultStream;
    }

    *av = stream->locked_getAv();  // TODO - review.

    hipError_t err = hipSuccess;
    return ihipLogStatus(err);
}

//// TODO - add identifier numbers for streams and devices to help with debugging.
// TODO - add a contect sequence number for debug. Print operator<< ctx:0.1 (device.ctx)

namespace hip_impl {
    std::vector<hsa_agent_t> all_hsa_agents() {
        std::vector<hsa_agent_t> r{};
        std::vector<hc::accelerator> visible_accelerators;
        for (int i=0; i < g_deviceCnt; i++)
            visible_accelerators.push_back(g_deviceArray[i]->_acc);
        for (auto&& acc : visible_accelerators) {
            const auto agent = acc.get_hsa_agent();

            if (!agent || !acc.is_hsa_accelerator()) continue;

            r.emplace_back(*static_cast<hsa_agent_t*>(agent));
        }
        return r;
    }

    [[noreturn]]
    void hip_throw(const std::exception& ex) {
        #if defined(__cpp_exceptions)
            if (auto rte = dynamic_cast<const std::runtime_error*>(&ex)) throw *rte;
            if (auto lge = dynamic_cast<const std::logic_error*>(&ex)) throw *lge;
            throw ex;
        #else
            std::cerr << ex.what() << std::endl;
            std::terminate();
        #endif
    }

} // Namespace hip_impl.
